Feature Engineering 101 


Day 1 


Standardization 


Standardization in Machine Learning 


Standard Deviation = 1 Standard Deviation = 1 
Mean =0 Mean = 0 


Standardize 


980 990 1000 1010 1020 1030 1040 


import numpy as np # linear algebra 
import pandas as pd # data processing 


import matplotlib.pyplot as plt 
import seaborn as sns 


# Load the Social Network Ads dataset into a DataFrame.
# NOTE(review): the file name may have lost underscores during text
# extraction (e.g. 'Social_Network_Ads.csv') -- confirm against the data file.
df = pd.read_csv('Social Network Ads.csv')

# Drop the first two columns by position; the displayed output shows the
# remaining columns are Age, EstimatedSalary and Purchased.
df = df.iloc[:, 2:]


df 


Age EstimatedSalary Purchased 


19 


35 


26 


27 


19 


46 


51 


50 


36 


49 


19000 


20000 


43000 


57000 


76000 


41000 


23000 


20000 


33000 


36000 


400 rows x 3 columns 


df.describe() 


count 
mean 
std 
min 
2596 
50% 
75% 


max 


Age EstimatedSalary 


400.000000 


37.655000 


10.482877 


18.000000 


29.750000 


37.000000 


46.000000 


60.000000 


400.000000 


69742.500000 


34096.960282 


15000.000000 


43000.000000 


70000.000000 


88000.000000 


150000.000000 


0 


0 


Purchased 


400.000000 


0.357500 


0.479864 


0.000000 


0.000000 


0.000000 


1.000000 


1.000000 


Now Train Test Split 


from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. 'Purchased' is the target column and
# every other column is a feature; random_state=0 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('Purchased', axis=1),
                                                    df['Purchased'],
                                                    test_size=0.3,
                                                    random_state=0)

X_train.shape, X_test.shape
# Output: ((280, 2), (120, 2))


# StandardScaler


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# transform train and test sets
# (the same fitted scaler is applied to both, so the test set is scaled with
# the parameters learned from the training set)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Per-feature means the scaler learned in fit().
scaler.mean_


array ([3.78642857e+01, 


X train 

Age 
92 26 
223 60 
234 38 
232 40 
377 42 
323 48 
192 29 
117 36 
47 27 
172 26 


EstimatedSalary 


15000 


102000 


112000 


107000 


53000 


30000 


43000 


52000 


54000 


118000 


280 rows x 2 columns 


X train scaled 


array([ 


O 00000 NkHE- 


.1631724 , 
«17018137, 
.0133054 , 
.20938504, 
.40546467, 
.28081405, 
.99370357, 
«99370357; 


=l. 


5849703 


.93098672 
.22017719 
.07558195 
.48604654 
«31253226 
.8330751 

.8563962 


s 


s 


it will learn the parameters 


6.98071429e+04]) 


.967092 
.307424 
.08473441, 
.06513258, 
.11134522, 
.96709276, 
.67297331, 
.26121221, 
.28081405, 
.08473441, 
.28081405, 
.89566375, 
.28081405, 
.99370357, 
.1631724 , 
26121221; 
.38586284, 
.28782302, 
.28081405, 
.47689368, 
.77101313, 
.99370357, 
.30742485, 
.99370357, 
.67297331, 
.67297331, 
.50350449, 
.07214155, 
.94749093, 
.18277423, 
.40546467, 
.89566375, 


.0133054 , 
.86905295, 
.1631724 , 
.17018137, 
.35925203, 
.40546467, 
.79762394, 
.96709276, 
111348297 
.96709276, 
.30742485, 
.69958412, 
.47689368, 
.7514113 , 
.67297331, 
.40546467, 
.28081405, 
.47689368, 
.20938504, 
.28782302, 
.79762394, 
.28081405, 
.0133054 , 
1824042. 
.08473441, 
.0133054 , 
127018137, 
29514193 5 
.87606192, 
.40546467, 
.1631724 , 
.79762394 
.28081405 
.65337148, 

76 

85 


, 


, 


, 


, 
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. 24909623 
.26126285 
.5849703 
.80415605 
.46929411 
.2901819 
.75747245 
.31253226 
.75747245 
.55503912 
.06341534 
.26686079 
.0233418 
.3526058 
.12125343 
.29476771 
.06341534 
.2901819 
.03449629 
.20342476 
.26584866 
.15017248 
.54388463 
.15017248 
.23692961 
.25469417 
.104501 
.3526058 
.12125343 
.13901799 
.29476771 
.36477242 
.25469417 
.05226085 
.74631796 
.49720103 
.06442747 
.58395817 
.80415605 
.53828669 
.39369146 
.49720103 
.03449629 
.00557724 
.89091319 
.35361793 
.2323438 
.97207239 
.46828198 
.26584866 
1.97207239 
.35361793 
.28361322 
1.24909623 
1.07558195 
.06442747 
.29476771 
.75747245 
.49821316 
.03449629 
.71180097 
.17909152 
.74631796 
.39369146 
.58395817 
L.1511846 


.1631724 , 
.20938504, 
.79762394, 
.07214155, 
.79762394, 
.28081405, 
.96709276, 
.20938504, 
.07214155, 
1.87606192, 

.38586284, 
.89566375, 
.48390265, 
.28081405, 
.97410174, 
.69958412, 
.45729185, 
.79762394, 
.40546467, 
.96709276, 
.0133054 , 
.1631724 , 
.86905295, 
.08473441, 
.55533166, 
.99370357, 
.09174339, 
.0133054 , 
.06513258, 
.79762394, 
.99370357, 
1631124 ; 
.28081405, 
.11134522, 
.35925203, 
.5749335 , 
.79762394, 
.28081405, 
JTTBDAST 4 
.89566375, 
.20938504, 
.5749335 , 
1.84945111, 
.26121221, 
.28081405, 
.96709276, 
.1897832 , 
.20938504, 
.1897832 , 
.28081405, 
.20938504, 
.79762394, 
11194529. 
J7514113-; 
.86905295, 
.67297331, 
.89566375, 
.20938504, 
.37885386, 
.0133054 , 
.40546467, 
.89566375, 
.11134522, 
.84945111, 
.08473441, 
18213423, 
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«715237 

.23692961 
.31253226 
.80415605 
.12125343 
.61287722 


1253226 
7037036 
1666762 
6686079 
1983223 


.24909623 
.11666762 
.23794174 
.90206768 
.71739891 
.3526058 
.35361793 
.13901799 
.41044389 
.31253226 
.41044389 
.2090227 
.03449629 
.42820845 
L.00658937 
.2090227 
.13901799 
L.52713221 
.2090227 
.05882953 
.52113221 
.78639149 
.31253226 
L.23794174 
.49821316 
.52612008 
.34145131 
.28361322 
.03550842 
.06341534 
.87314863 
. 41145602 
.58395817 
.52612008 
1.09334651 
.44037507 
.31253226 
.74631796 
.06341534 
.08774857 
.09334651 
.03449629 
.12125343 
.15017248 
.17909152 
.29577984 
.25469417 
.22017719 
.29476771 
.15017248 
.65956082 
.15017248 
.29577984 
.29476771 
.28361322 


.30742485, 
.18277423, 
.99370357, 
1893423. 
1.28782302, 

.06513258, 
.0133054 , 
.11134522, 
.55533166, 
.47689368, 
.99370357, 
1.97410174, 

.48390265, 
.5749335 , 
.58194247, 
777101313; 
1.97410174, 

.1631724 , 
.69958412, 
.35925203, 
.20938504, 
.47689368, 
.6015443 , 
.55533166, 
.47689368, 
.50350449, 
.35925203, 
.79762394, 
.28081405, 
.58194247, 
.99370357, 
.28081405, 
.08473441, 
.45729185, 
.08473441, 
.67297331, 
.47689368, 
.699584 
TISS: 
.84945111, 
.09174339, 
.11134522, 
.28081405, 
.11134522, 
.35925203, 
.307424 
.86905295, 
.58194247, 
.28081405, 
.08473441, 
.86905295, 
.67297331, 
.405464 
2101313, 
.38586284, 
.1897832 , 
1980521 y 
.86905295, 
.77101313, 
.1631724 , 
.47689368, 
.30742485, 
218277425. 
.67998229, 
.99370357, 
.30742485, 


12; 


85, 


67, 
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.51496559 
.59612479 
.18010365 
.62504383 
.8563962 
.37037036 
.03449629 
.25469417 
.23794174 
.28361322 
.12125343 
.35361793 
.06341534 
.36477242 
.00557724 
«29476771 
. 7285534 
.51496559 
.26584866 
.42820845 
.15017248 
.2090227 
.00099143 
.49821316 
.54388463 


1.82747716 
1.09334651 


.38253697 
.42820845 
.98882482 
.42261051 
.48604654 
.14558666 
.11009894 
.94315334 
.34145131 
.8330751 


L.38253697 
L.5849703 


.46929411 
.12125343 
.50936765 
.09233438 
.03449629 
.35361793 
.06341534 
.38152485 
.26686079 
.74631796 
.15017248 
.65956082 
.05226085 
.45712749 


1.88531525 
.27801528 


.97767033 
.82747716 
.25469417 
.55503912 


L.55605125 


.12226556 
.06341534 


L.06442747 
1.59612479 
.76963906 


.03449629 


£77101313; 
.08473441, 
.30742485, 


1.97410174, 


.77101313, 


1.7514113 , 


.08473441, 
30742485, 


1.09174339, 


.96709276, 


1.48390265, 


.20938504, 
.17018137, 
.28081405, 
.65337148, 
.0133054 , 
.11134522, 
.08473441, 
.1631724 , 
.28081405, 
.58194247, 
217101313, 
211154822. 
.86905295, 
.47689368, 
.28081405, 
.30742485, 
.30742485, 
11134520, 
.06513258, 
.65337148, 
.1631724 , 
.67297331, 
.11134522, 
.30742485, 
.89566375, 
.30742485, 
.08473441, 
.17018137, 
:26191221; 
.96709276, 
.0133054 , 
.18277423, 


ELISA , 


1. 7780221. ; 
.20938504, 
.40546467, 


STATER , 


.20938504, 
.89566375, 


1.94749093, 


.28081405, 


1.87606192, 


.37885386, 


L.09174339, 


1.06513258, 
1.84945111, 
.11134522, 


1.1631724 , 


1.26121221, 
.96709276, 


1.67998229, 


1.1897832 , 
1.09174339, 
1.38586284, 

.28081405, 


O: + 
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225 T5433 
.06341534 
.19685608 
.65956082 
.33585337 
.60172273 
.12125343 
.31253226 
.55503912 
.26584866 
.3526058 
.37037036 


1.03550842 
1.104501 
.06341534 


.03449629 


1.04666291 
.37037036 


.06341534 


1.35361793 
1.104501 
1.52713221 
1.8563962 
«115237 


.715237 
.91983223 
.71739891 
.06341534 
.8563962 
.94315334 
.55605125 
.09334651 
.11009894 
.09233438 
.26584866 
.57280368 
.1511846 
.67071531 
.68847986 
.38253697 
.94875128 
.42820845 
.45712749 
.97767033 
.98882482 
.37037036 


1.104501 

1.35361793 
.13901799 

1.44037507 
.46828198 


.26584866 
.06442747 
.06341534 


0.89091319 


.12226556 
.00557724 
.26584866 
.32368675 
.29476771 
.43936294 
.89091319 
.52612008 
.52612008 
.31910094 
.13901799 


.40546467, -0.4571 
.37885386, -0.775237 

.08473441, -0.51496559 
99370357, =1,1511 
.86905295, -0.775237 

.18277423, -0.51496559 
.06513258, -0.4571 
.1631724 , 1.39369146 


12749 


1846 


12749 


# transform() returned plain arrays (see the array output above); wrap them
# back into DataFrames so the columns can be addressed by name again.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


X test scaled 


117 


118 


119 


Age 
-0.771013 
0.013305 
-0.280814 
-0.771013 


-0.280814 


1.091743 
0.699584 
-0.672973 
0.797624 


0.895664 


EstimatedSalary 
0.497201 
-0.572804 
0.150172 
0.265849 


-0.572804 


-0.139018 
1.769639 
0.555039 
0.352606 


-0.543885 


120 rows x 2 columns 


X train scaled 


275 


276 


277 


278 


279 


Age 
-1.163172 
2.170181 
0.013305 
0.209385 


0.405465 


0.993704 
-0.869053 
-0.182774 
-1.065133 


-1.163172 


EstimatedSalary 
-1.584970 
0.930987 
1.220177 
1.075582 


-0.486047 


-1.151185 
-0.775237 
-0.514966 
-0.457127 


1.393691 


columns=X test.columns) 


280 rows x 2 columns 


# Summary statistics of the unscaled training features, rounded to 1 decimal.
np.round(X_train.describe(), 1)


count 
mean 
std 
min 
2596 
50% 
7596 


max 


# Summary statistics of the scaled training features, rounded to 1 decimal.
np.round(X_train_scaled.describe(), 1)


count 


mean 


std 


min 


25% 


50% 


75% 


max 


Age EstimatedSalary 


280.0 280.0 
37.9 69807.1 
10.2 34641.2 
18.0 15000.0 
30.0 43000.0 
37.0 70500.0 
46.0 88000.0 
60.0 150000.0 


Age EstimatedSalary 


280.0 280.0 
0.0 0.0 
1.0 1.0 

-1.9 -1.6 
-0.8 -0.8 
-0.1 0.0 
0.8 0.5 
2.2 2.3 


Now set the scaling 


# Side-by-side scatter plots of Age vs EstimatedSalary, before and after scaling.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

ax1.scatter(X_train['Age'], X_train['EstimatedSalary'])
ax1.set_title("Before Scaling")
ax2.scatter(X_train_scaled['Age'], X_train_scaled['EstimatedSalary'],color='red')
ax2.set_title("After Scaling")
plt.show()


Before Scaling : After Scaling 
2. 


140000 


120000 


100000 


80000 


60000 


40000 


20000 


# Density plots of both features on shared axes, before and after scaling.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1)
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# after scaling
ax2.set_title('After Standard Scaling')
sns.kdeplot(X_train_scaled['Age'], ax=ax2)
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2)
plt.show()


Before Scaling After Standard Scaling 
0.040 


0.035 
0.030 
0.025 


0.020 


Density 
Density 


0.015 
0.010 


0.005 


000 
-25000 O 25000 50000 750001000002500050000175000 
Age 


Comparison of Distributions 


#Age Dist.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Age Distribution Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1)

# after scaling
ax2.set_title('Age Distribution After Standard Scaling')
sns.kdeplot(X_train_scaled['Age'], ax=ax2)

plt.show()

#Salary Dist.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Salary Distribution Before Scaling')
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# after scaling
ax2.set_title('Salary Distribution Standard Scaling')
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2)

plt.show()
Age Distribution Before Scaling Age Distribution After Standard Scaling 
0.040 0.40 
0.035 035 
0.030 030 
0.025 0.25 
E S 
a a 
E 0.020 5 020 
a a 
0.015 0.15 
0.010 0.10 
0.005 0.05 
0.000 0.00 
10 20 30 40 50 60 70 -3 -2 -1 0 1 2 3 
Age Age 
le-5 X Salary Distribution Before Scaling Salary Distribution Standard Scaling 
12 
0.40 
10 0.35 
0.30 
0.8 
> > 025 
= = 
É 06 e 
2^ & 020 
04 0.15 
0.10 
0.2 
0.05 
0.0 0.00 
-25000 O 25000 50000 750001000002500015000075000 -2 -1 0 1 2 3 
EstimatedSalary EstimatedSalary 


Importance of Scaling 


From Logistic Regression 


from sklearn.linear_model import LogisticRegression

#Create one model for the raw features and one for the scaled features
lr = LogisticRegression()
lr_scaled = LogisticRegression()

#Fit each model on its own version of the training set
lr.fit(X_train,Y_train)
lr_scaled.fit(X_train_scaled,Y_train)


LogisticRegression() 


#Now it's time to predict
y_pred = lr.predict(X_test)
y_pred_scaled = lr_scaled.predict(X_test_scaled)

#Find out Acc. Score
from sklearn.metrics import accuracy_score

# "Actual" is the model trained on unscaled features, "Scaled" the scaled one.
print("Actual",accuracy_score(Y_test,y_pred))
print("Scaled",accuracy_score(Y_test,y_pred_scaled))


Actual 0.6583333333333333 
Scaled 0.8666666666666667 


From DecisionTreeClassifier 


from sklearn.tree import DecisionTreeClassifier

#Create one tree for the raw features and one for the scaled features
# NOTE(review): no random_state is set, so results may differ slightly
# between runs -- confirm whether that is intended.
dt = DecisionTreeClassifier()
dt_scaled = DecisionTreeClassifier()

#Fit each tree on its own version of the training set
dt.fit(X_train,Y_train)
dt_scaled.fit(X_train_scaled,Y_train)


DecisionTreeClassifier() 


#Now it's time to predict
y_pred = dt.predict(X_test)
y_pred_scaled = dt_scaled.predict(X_test_scaled)

#Find out Acc. Score
from sklearn.metrics import accuracy_score

#It's better than LogisticRegression
# "Actual" is the tree trained on unscaled features, "Scaled" the scaled one.
print("Actual",accuracy_score(Y_test,y_pred))
print("Scaled",accuracy_score(Y_test,y_pred_scaled))


Actual 0.875 
Scaled 0.8666666666666667 


#See the standrization our data 
df.describe() 


Age EstimatedSalary Purchased 

count 400.000000 400.000000 400.000000 
mean 37.655000 69742.500000 0.357500 
std — 10.482877 34096.960282 0.479864 
min 18.000000 15000.000000 0.000000 
2596 29.750000 43000.000000 0.000000 
50% 37.000000 70000.000000 0.000000 
75% 46.000000 88000.000000 1.000000 


max 60.000000 150000.000000 1.000000 


#Now let's check the model.


#Dummy values
# Add three extreme rows (outliers) to the data. DataFrame.append was removed
# in pandas 2.0, so pd.concat is used instead; ignore_index=True continues the
# row labels (400, 401, 402), matching the displayed output.
# NOTE(review): the original line was cut off after 'Purc...; the Purchased
# values [0, 1, 1] are taken from rows 400-402 of the displayed output.
df = pd.concat(
    [df, pd.DataFrame({'Age': [10, 85, 70],
                       'EstimatedSalary': [2500, 450000, 250000],
                       'Purchased': [0, 1, 1]})],
    ignore_index=True,
)


df 

Age EstimatedSalary Purchased 

0 19 19000 0 

1 35 20000 0 

2 26 43000 0 

3 27 57000 0 

4 19 76000 0 
398 36 33000 0 
399 49 36000 1 
400 10 2500 0 
401 85 450000 1 
402 70 250000 1 


403 rows x 3 columns 


plt.scatter(df['Age'], df['EstimatedSalary']) 


«matplotlib.collections.PathCollection at 0x21a2f4d2790> 


400000 


300000 


200000 


100000 


# In [35]
from sklearn.model_selection import train_test_split

# Same 70/30 split as before, now on the data that includes the dummy rows.
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('Purchased', axis=1),
                                                    df['Purchased'],
                                                    test_size=0.3,
                                                    random_state=0)

X_train.shape, X_test.shape


Out[35]: ((282, 2), (121, 2)) 


# In [36]
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# transform train and test sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [37]: scaler.mean_ 


Out[37]: array([3.81489362e+01, 6.98528369e+04]) 


In [38]: X train 


Out[ 38]: Age EstimatedSalary 
179 31 34000 
219 59 143000 
302 37 137000 
49 31 89000 


241 38 59000 


323 48 30000 
192 29 43000 


117 36 52000 


Age EstimatedSalary 


47 


172 


27 


26 


54000 


118000 


282 rows x 2 columns 


X train scaled 


array LL 


-6.60321806e-01, 


1.92593860e+00 

.06123147e-01 
.60321806e-01 
L.37567043e-02 
1.55647283e+00 

.12215402e+00 
.17541284e-01 
1.37173994e+00 
1.00227417e+00 

.74120572e+00 
.01830504e+00 
.30688691e+00 
.25174841e-01 
.83222477e-01 
.37421136e-01 
.00227417e+00 
.52688249e-01 
.17541284e-01 
.32808398e-01 
.37567043e-02 
.90856034e-01 
.60321806e-01 
.55709068e-01 
.98489591e-01 
.75588920e-01 
.32808398e-01 
.63342625e-01 
.55709068e-01 
.90856034e-01 
.55709068e-01 
.98489591e-01 
.67635268e+00 
.37567043e-02 
.25174841e-01 
.67635268e+00 
.32808398e-01 
.55709068e-01 
.90856034e-01 
.09907727e-01 
1.27937350e+00 

.58398624e+00 
.37421136e-01 
.09907727e-01 
L.06123147e-01 
L.02978758e+00 
.00227417e+00 
.37421136e-01 
.60321806e-01 
.21452047e+ 
.90856034e-01 
.18700706e+00 
.63342625e-01 
.86097388e-02 
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, 


, 


, 


, 


, 
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zu 
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1.80464881e+ 

.65661993e+ 
.72388862e-01 
.67755554e-01 
1.48391956e+ 

.35330070e+ 
.89798878e-01 
.84542567e-01 
1.75530585e+ 

.28654772e+ 
.93118109e-01 
.25461477e+ 
.47717381e-01 
.11384736e+ 
.67755554e-01 
.61185164e-01 
.25674057e-01 
.16475511e-01 
.08191441e+00 
.89798878e-01 
.89798878e-01 
.02316654e-01 
.50345537e-01 
.89798878e-01 
.95267769e+ 
.25674057e-01 
.29736931e-02 
1.95267769e+ 

.26988135e-01 
.15784437e-01 
.26988135e-01 
.99688498e-01 
.18412593e-01 
.45089226e-01 
.99688498e-01 
.13156281e-01 
.19726671e-01 


84542567e-01 
00 
00 


00 
00 


00 
00 


00 


00 


00 


00 


.21731823e-01 
.02316654e-01 
.20527181e+00 
.57122290e-02 
.37827761e-01 
.09214047e-01 
.09214047e-01 
.97060342e-01 


.19726671e-01 
.31121920e+ 
.18786180e+ 
e-01 


.23045901 


00 
00 


.83022126e-02 


.15592885e4 


-00 


.29736931e-02 
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0022741 7e+00 
.01830504e+00 
.12215402e+00 
.17541284e-01 
.21452047e+ 
.98489591e-01 
.37421136e-01 
.55709068e-01 
.75588920e-01 
.5268824 
1.74120572e+00 
L.06123147e-01 
.37567043e-02 
.60321806e-01 
.60321806e-01 
.06123147e-01 
.63342625e-01 
.86108557e+ 
.09907727e-01 
.01830504e+ 
.17541284e-01 
.12215402e+ 
.63342625e-01 
.25174841e-01 
.01830504e+ 
1.83357216e+ 
2.90856034e-01 
.37421136e-01 
.37173994e+ 
.90856034e-01 
L.37567043e-02 
.92593860e+ 
.55709068e-01 
.17541284e-01 
.90856034e-01 
.83357216e+ 
.17541284e-01 
.09907727e-01 
.39925335e+ 
.25174841e-01 
.70976182e-01 
.37421136e-01 
1.37173994e« 
.12215402e+ 
.45054693e-01 
L.06123147e-01 
.49161979e+ 
.32808398e-01 
.90856034e-01 
1.74120572e+ 
.02978758e+ 
.25174841e-01 
L.06123147e-01 
.12215402e+ 
.09907727e-01 
1.09464061e+ 
.3068869le+ 
.67955363e-01 
.90856034e-01 
.90856034e-01 
.17541284e-01 
.18700706e+ 
.70976182e-01 
.67955363e-01 
.76871912e+00 
.21452047e+00 
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.98374420e-01 
.87170722e-01 
.98374420e-01 
.06450439e+00 
.25674057e-01 
.09214047e-01 
.72388862e-01 
.44398151e-01 
.06450439e+00 
.16475511e-01 
.09214047e-01 
.01002576e-01 
1.03983291e+00 
.27928625e+00 
.83022126e-02 
1.26988135e-01 
.69069632e-01 
.37827761e-01 
.21253328e+00 
.84542567e-01 
.83228489e-01 
.62499242e-01 
.50345537e-01 
.67755554e-01 
.41146992e-01 
.15592885e+ 
.43084073e-01 
.67755554e-01 
.80464881e+ 
.17098515e-01 
.83022126e-02 
.47717381e-01 
.63073206e-03 
.15592885e+00 
.05724293e+00 
.68446628e-01 
.67132550e-01 
.45089226e-01 
.996884 
.15592885e+00 
.18412593e-01 
.49031459e-01 
.29736931e-02 
.49031459e-01 
.03257145e+00 
.83022126e-02 
.66441476e-01 
.33885528e-01 
.66441476e-01 
1.02316654e-01 
L.30395774e+ 
L.03257145e« 
L.93741112e-01 
L .30395774e+ 
.58260549e+ 
.37827761e-01 
L.05724293e« 
1.27928625e+ 
.26988135e-01 
.92427034e-01 
.84542567e-01 
.87866325e+ 
.29736931e-02 
.43775148e-01 
.20527181e+ 
.97060342e-01 


00 


00 


98e-01 


00 
00 


00 
00 


00 
00 


00 


00 


.90856034e-01 
.37421136e-01 
.60001711e+00 
1.70976182e-01 
.37567043e-02 
.90856034e-01 
.70976182e-01 
.63342625e-01 
.86097388e-02 
.67635268e+00 
.45054693e-01 
.60321806e-01 
.17541284e-01 
.92593860e+00 
.83222477e-01 
.37567043e-02 
.90856034e-01 
.76871912e+00 
.06123147e-01 
.90856034e-01 
.63342625e-01 
.63342625e-01 
.86097388e-02 
.09907727e-01 
L.98489591e-01 
L.37567043e-02 
L.02978758e+00 
.27937350e+00 
.86097388e-02 
.49161979e+00 
.75588920e-01 
.06123147e-01 
.40441955e-01 
.37567043e-02 
.67955363e-01 
.25174841e-01 
.52688249e-01 
1.83357216e+00 
| .12215402e+00 
1.83357216e+00 
| .30688691e+00 
.70976182e-01 
.75588920e-01 
1.64883927e+ 
.49161979e+ 
.75588920e-01 
.48075511e-01 
.30688691e+ 
.25174841e-01 
1.27937350e« 
L.98489591e-01 
.74120572e+ 
.90856034e-01 
L.06123147e-01 
.39925335e+ 
.86097388e-02 
.60321806e-01 
.75588920e-01 
.32808398e-01 
.52688249e-01 
.76871912e+00 
.00227417e+00 
.37567043e-02 
.90856034e-01 
.70976182e-01 
.09907727e-01 
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.47717381e-01 
.33885528e-01 
.66169420e+00 
.67755554e-01 
.65127398e-01 
.29736931e-02 
.77997733e+00 
.23045901e-01 
.83022126e-02 
1.02316654e-01 
1.26988135e-01 
1.51659615e-01 
L.10658589e« 
1.51659615e-01 

.03983291e+ 
.50345537e-01 
.18412593e-01 
.10658589e+ 
.50345537e-01 
.76331096e-01 
.40455917e-01 
.40455917e-01 
.45089226e-01 
.00789997e+ 
.38523364e+ 
.89798878e-01 
.17098515e-01 
.85856644e-01 
.18412593e-01 
.05724293e+ 
.43084073e-01 
.25674057e-01 
.70596289e+ 
.18412593e-01 
.16319032e+ 
.11842203e-01 
.50345537e-01 
.20417745e-01 
.40455917e-01 
.85856644e-01 
.66441476e-01 
1.26988135e-01 

.03257145e+ 
.43084073e-01 
.27928625e+ 
.65127398e-01 
.55793400e+ 
.33885528e-01 
L.18060033e« 
.68129141e+ 
.43084073e-01 
.43084073e-01 
.15784437e-01 
.82932029e+00 
.50551900e-02 
.26988135e-01 
.92427034e-01 
.11842203e-01 
L.18060033e+00 
L.35330070e+00 
L.25461477e+00 
1.02316654e-01 

.19726671e-01 
.76451736e-02 
.43084073e-01 
.09214047e-01 
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00 
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00 
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00 


00 
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00 
00 


.30688691e+00 
.63342625e-01 
.45054693e-01 
.86097388e-02 
.90856034e-01 
.98489591e-01 
.45054693e-01 
.60321806e-01 
.55709068e-01 
.52688249e-01 
.27937350e+00 
.09464061e+00 
.64883927e+00 
.45054693e-01 
.52688249e-01 
.12215402e+ 
.75588920e-01 
.48075511e-01 
.32746612e+ 
.55647283e+ 
.09907727e-01 
.63342625e-01 
.5268824 
.06123147e-01 
.86097388e-02 
.46410639e+ 
.5268824 
1.67635268e+ 
.06123147e-01 
.63342625e-01 
.00227417e+ 
.37421136e-01 
1.37173994e« 
1.70976182e-01 

.70976182e-01 
.90856034e-01 
.58398624e+00 
.01830504e+00 
.86097388e-02 
1.46410639e+00 

.12215402e+00 
.90856034e-01 
.37567043e-02 
.52688249e-01 
.86097388e-02 
.45054693e-01 
.75588920e-01 
.90856034e-01 
.63342625e-01 
.63342625e-01 
.86097388e-02 
.02978758e+00 
.58398624e+00 
.12215402e+00 
.60321806e-01 
.25174841e-01 
.63342625e-01 
.92593860e+ 
.63342625e-01 
L.06123147e-01 
1.92593860e+ 

.21452047e+ 
.37421136e-01 
L.37567043e-02 
L.98489591e-01 
L.67635268e+00 
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.15592885e+00 
.29736931e-02 
.24359979e-01 
.76451736e-02 
.37827761e-01 
.19103667e-01 
.638 
.571 
SOLL 
1.60727697e+ 

.089 
«351 
.55793400e+ 
.18412593e-01 
.72388862e-01 
.32862922e+ 
.58557008e-01 
.45924808e+00 
.37879333e+00 
1.36056216e+00 

.50859104e+00 
.83022126e-02 
.93741112e-01 
.29736931e-02 
1.76331096e-01 
|.08191441e+ 
.13851884e+ 
.14470359e-01 
.02316654e-01 
.67755554e-01 
.72388862e-01 
.25674057e-01 
.99688498e-01 
.17098515e-01 
.83022126e-02 
.41146992e-01 
.29736931e-02 
.24359979e-01 
.91804031e-01 
.63073206e-03 
.29736931e-02 
.15592885e+00 
.06450439e+00 
.30395774e+00 
.5826054 
.62499242e-01 
.62499242e-01 
.8585664 
.13156281e-01 
.29736931e-02 
.5826054 
.65661993e+00 
.32862922e+00 
.33885528e-01 
.50551900e-02 
.33885528e-01 
.25674057e-01 
.00789997e+ 
.83228489e-01 
.71074784e-01 
.48391956e+ 
.18060033e+ 
.10528125e-01 
.66441476e-01 
.91112956e-01 
.35199606e-01 


13320e-01 
22290e-02 
12956e-01 
00 
17587e+00 
99606e-01 
00 


00 


00 
00 


9e+00 


4e-01 


9e+00 


00 


00 
00 


# Wrap the scaled arrays back into DataFrames that keep the original column names.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


.64883927e+00 
.70976182e-01 
.55709068e-01 
1.67635268e+00 
1.70976182e-01 
.17541284e-01 
.86108557e+00 
.90856034e-01 
.86097388e-02 
.83222477e-01 
.09907727e-01 
.02978758e+00 
.76871912e+00 
.86097388e-02 
.12215402e+00 
.21452047e+00 
.37421136e-01 
.55647283e+00 
.09464061e+00 
.00227417e+00 
.27937350e+00 
.90856034e-01 
.55709068e-01 
.83222477e-01 
.06123147e-01 
.09907727e-01 
.45054693e-01 
.98489591e-01 
.02978758e+00 
.12215402e+00 


# Show the scaled training and test frames.
print(X_train_scaled)
print(X_test_scaled)


© MM FO 


Age 


.660322 
.925939 
.106123 
.660322 
a e R 
.909908 
.845055 
.198490 
.029788 
.122154 


[282 rows x 2 


E LM M HP O 


Age 


.290856 
.355709 
.567955 
.937421 
.306887 
.290856 
.383222 
.290856 
.106123 
.817541 
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ai . 
.804649 
.656620 
.472389 
.267756 


columns] 


.42461070e-01 
.17098515e-01 
.41146992e-01 
.15592885e+00 
.19726671e-01 
.22994329e+00 
.98374420e-01 
.25674057e-01 
.83022126e-02 
.29736931e-02 
.59871086e-01 
.58557008e-01 
.63073206e-03 
.25674057e-01 
.75017018e-01 
.50345537e-01 
.73702940e-01 
.61185164e-01 
.47717381e-01 
.47717381e-01 
.97734917e+00 
.19726671e-01 
.91112956e-01 
.62499242e-01 
.40455917e-01 
.83228489e-01 
.62499242e-01 
.40455917e-01 
.91112956e-01 
.18786180e+00 


EstimatedSalary 


884543 


. 983228 
.662499 
.440456 
.391113 
.187862 


=1, 
.126988 
.163190 
. 423046 
.292427 


EstimatedSalary 


180600 


.106586 
.106586 
.489799 
.317099 
.514470 


columns=X train.columns) 
columns=X test.columns) 


[121 rows x 2 columns] 


# Compare summary statistics before and after scaling, rounded to 1 decimal.
print(np.round(X_train.describe(), 1))
print(np.round(X_train_scaled.describe(), 1))


Age EstimatedSalary 


count 282.0 282.0 
mean 38.1 69852.8 
std 10.8 40604.7 
min 10.0 2500.0 
25$ 30.0 43000.0 
50% 37.0 68000.0 
75% 46.0 86750.0 
max 85.0 450000.0 

Age EstimatedSalary 
count 282.0 282.0 
mean 0.0 =0:..0 
std 1.0 1.0 
min -2.6 el.7 
25% -0.8 =047 
50% =0 1 -0.0 
75% 0.7 0.4 
max 4.3 9.4 

#A
# Side-by-side scatter plots of Age vs EstimatedSalary, before and after scaling.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

ax1.scatter(X_train['Age'], X_train['EstimatedSalary'])
ax1.set_title("Before Scaling")

ax2.scatter(X_train_scaled['Age'], X_train_scaled['EstimatedSalary'],color='red')
ax2.set_title("After Scaling")
plt.show()

#B


# Density plots of both features on shared axes, before and after scaling.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1)
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# after scaling
ax2.set_title('After Standard Scaling')
sns.kdeplot(X_train_scaled['Age'], ax=ax2)
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2)
plt.show()

#C

#Age Dist.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Age Distribution Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1)

# after scaling
ax2.set_title('Age Distribution After Standard Scaling')
sns.kdeplot(X_train_scaled['Age'], ax=ax2)

plt.show()

#Salary Dist.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Salary Distribution Before Scaling')
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# after scaling
ax2.set_title('Salary Distribution Standard Scaling')
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2)
plt.show()
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In [44]: 


Age Distribution Before Scaling 


Age Distribution After Standard Scaling 
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#From Logistics regression 
from sklearn.linear_model import LogisticRegression

#Create one model for the raw features and one for the scaled features
lr = LogisticRegression()
lr_scaled = LogisticRegression()

#Fit each model on its own version of the training set
lr.fit(X_train,Y_train)
lr_scaled.fit(X_train_scaled,Y_train)

#Now it's time to predict
y_pred = lr.predict(X_test)
y_pred_scaled = lr_scaled.predict(X_test_scaled)

#Find out Acc. Score
from sklearn.metrics import accuracy_score

# "Actual" is the model trained on unscaled features, "Scaled" the scaled one.
print("Actual",accuracy_score(Y_test,y_pred))
print("Scaled",accuracy_score(Y_test,y_pred_scaled))


Actual 0.6446280991735537 


Scaled 0.8512396694214877 


#From DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

#Create one tree for the raw features and one for the scaled features
dt = DecisionTreeClassifier()
dt_scaled = DecisionTreeClassifier()

#Fit each tree on its own version of the training set
dt.fit(X_train,Y_train)
dt_scaled.fit(X_train_scaled,Y_train)

#Now it's time to predict
y_pred = dt.predict(X_test)
y_pred_scaled = dt_scaled.predict(X_test_scaled)

#Find out Acc. Score
from sklearn.metrics import accuracy_score

# "Actual" is the tree trained on unscaled features, "Scaled" the scaled one.
print("Actual",accuracy_score(Y_test,y_pred))
print("Scaled",accuracy_score(Y_test,y_pred_scaled))


Actual 0.8760330578512396 
Scaled 0.8760330578512396 


Conclusion 


We observed without outliers: 


Our accuracy score with LR is: Actual (unscaled) ≈ 65.8% and Scaled ≈ 86.7%. 
Our accuracy score with DT is: Actual (unscaled) ≈ 87.5% and Scaled ≈ 86.7%. 
We observed with outliers: 

Our accuracy score with LR is: Actual (unscaled) ≈ 64.5% and Scaled ≈ 85.1%. 


Our accuracy score with DT is: Actual (unscaled) ≈ 87.6% and Scaled ≈ 87.6%. 


Outliers' effects on data: 


1. They lower data quality: outliers caused by measurement error imply poor data quality. 
2. They skew the data's mean. 
3. They lead to incorrect analyses and misleading insights. 


Outlier detection methods: 


1. Domain knowledge 
2. Z-score 
3. Interquartile range (IQR) 
4. Boxplot 
5. Scatter plot 
6. Histogram 
7. Clustering techniques 
8. Isolation Forest 
9. Local Outlier Factor (LOF) 
10. Minimum Covariance Determinant (MCD) 


Examples 


Feature Engineering 101 


Day 2 


Column Transformer 


Original data Column Transformer Transformed data 
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import numpy as np 
import pandas as pd 


from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder 


# Load the toy COVID dataset and display it.
# NOTE(review): the file name may have lost an underscore during text
# extraction (e.g. 'covid_toy.csv') -- confirm against the data file.
df = pd.read_csv('covid toy.csv')
df


age gender fever cough city has covid 


0 60 Male 103.0 Mild Kolkata No 
1 27 Male 100.0 Mild Delhi Yes 
2 42 Male 101.0 Mild Delhi No 
3 31 Female 980 Mild Kolkata No 
4 65 Female 101.0 Mild Mumbai No 
95 12 Female 104.0 Mild Bangalore No 
96 51 Female 101.0 Strong Kolkata Yes 
97 20 Female 101.0 Mild Bangalore No 
98 5 Female 98.0 Strong Mumbai No 
99 10 Female 98.0 Strong Kolkata Yes 


100 rows x 6 columns 


df.isnull().sum() 


age 
gender 

fever I 
cough 

city 
has_covid 
dtype: int64 


SO SO O OO 


from sklearn.model_selection import train_test_split
# 'has_covid' (spelled as in the isnull() output above) is the target; the
# remaining columns are the features. 30% of the rows are held out for testing.
# NOTE(review): the original line was cut off after df['has c...; the target
# column name is completed from the column listing.
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'],
                                                test_size=0.3)


X train 


age gender fever cough city 
39 50 Female 103.0 Mild Kolkata 
92 82 Female 102.0 Strong Kolkata 
11 65 Female 98.0 Mild Mumbai 
52 47 Female 100.0 Strong Bangalore 


28 16 Male 104.0 Mild Kolkata 


32 34 Female 101.0 Strong Delhi 
15 70 Male 103.0 Strong Kolkata 
19 42 Female NaN Strong Bangalore 
12 25 Female 99.0 Strong Kolkata 


33 26 Female 98.0 Mild Kolkata 


70 rows x 5 columns 


Long method 


# adding simple imputer to fever col
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# also the test data
# Only transform here: the imputer must reuse the fill value learned from the
# training set. Refitting on the test set would fill with test-set statistics
# (data leakage) and make train/test preprocessing inconsistent.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape
(70, 1) 
# Ordinal encoding -> cough
# The explicit category order encodes 'Mild' as 0 and 'Strong' as 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# also the test data
# Reuse the encoder fitted on the training set instead of refitting it.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape


(70, 1) 

# Show how many rows fall into each category of the two nominal columns.
print("City")
print(df['city'].value_counts())
print('Gender')
print(df['gender'].value_counts())
City 

Kolkata 32 

Bangalore 30 

Delhi 22 

Mumbai 16 

Name: city, dtype: int64 

Gender 

Female 59 

Male 41 

Name: gender, dtype: int64 


# Ordinal encoding -> cough
# The explicit category order encodes 'Mild' as 0 and 'Strong' as 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# also the test data
# Reuse the encoder fitted on the training set instead of refitting it.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape
(70, 1) 
# OneHotEncoding -> gender,city
# drop='first' removes one dummy column per feature; with 2 genders and
# 4 cities this yields the 4 columns shown in the shape below.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch the
# keyword if this is run on a newer release.
ohe = OneHotEncoder(drop='first',sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# also the test data
# Only transform here: refitting on the test set would learn its own category
# list, which can produce different columns than the training encoding.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape

(70, 4) 

# Extracting Age
# Dropping every other column leaves only 'age'; .values turns it into a
# 2-D array so it can be concatenated with the transformed pieces.
X_train_age = X_train.drop(columns=['gender','fever','cough','city']).values

# also the test data
X_test_age = X_test.drop(columns=['gender','fever','cough','city']).values

X_train_age.shape
(70, 1) 
# Stack the separately transformed pieces side by side (column-wise).
# NOTE(review): both source lines were cut off at the page edge; the trailing
# `X_*_cough), axis=1)` is reconstructed from the one array not yet listed and
# from the (70, 7) shape shown below -- confirm against the notebook.
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)

# also the test data
X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis=1)

X_train_transformed.shape


(70, 7) 


Column Transformer 


sklearn.compose import ColumnTransformer 
sklearn.pipeline import Pipeline 
sklearn.preprocessing import StandardScaler, OneHotEncoder 


sklearn.linear model import LogisticRegression 


preprocessor - ColumnTransformer( 
transformers-[ 
( , StandardScaler(), [ 


1), 
, OneHotEncoder(), [ 


pipe = Pipeline([ 


, preprocessor), 


, LogisticRegression()) 


pipe.fit(X train, y train) 


y pred = pipe.predict(X test) 


from sklearn.compose import ColumnTransformer 


transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')


transformer.fit_transform(X_train)
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transformer.transform(X test) 
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int(transformer.fit transform(X train).shape) 
int(transformer.transform(X test).shape) 


print("X Train")
print(X_train)


print("X Test")
print(X_test)


X Train 


ag gender f 
39 50 Female 
92 82 Female 
11 65 Female 
52 17 Female 
28 16 Male 
32 34 Female 
15 70 Male 
19 42 Female 
12 25 Female 
33 26 Female 


70 rows x 5 colum 
X Test 


V 


103. 
102. 


100. 
104. 


e 
0 

0 
98.0 
0 

0 


101.0 
103.0 


NaN 
99.0 
98.0 


ns] 


ag gender fever 
10 75 Female NaN 
80 14 Female 99.0 
96 51 Female 101.0 
6 14 Male 01.0 
22 71 Female 98.0 
7 20 Female NaN 
79 48 Female 103.0 
89 46 Male 103.0 
78 11 Male 100.0 
42 27 Male 100.0 
20 12 Male 98.0 
61 81 Female 98.0 
48 66 Male 99.0 
18 64 Female 98.0 
64 42 Male 104.0 
9 64 Female 101.0 
2 42 Male 101.0 
84 69 Female 98.0 
27 33 Female 102.0 
5 84 Female NaN 
59 6 Female 104.0 
93 27 Male 00.0 
65 69 Female 02.0 
23 80 Female 98.0 
94 79 Male NaN 
16 69 Female 103.0 
49 44 Male 104.0 
26 19 Female 100.0 
60 24 Female 102.0 
4 65 Female 101.0 


print (X_train.shape) 
print (X_test.shape) 
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city 
Kolkata 
Kolkata 
Mumbai 
Bangalore 
Kolkata 
Delhi 
Kolkata 
Bangalore 
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Kolkata 


Kolkata 
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In [1]: import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy.stats as stats 
from sklearn.model selection import train test split 
from sklearn.model selection import cross val score 
from sklearn.linear model import LinearRegression 
from sklearn.metrics import r2 score 
from sklearn.preprocessing import PowerTransformer 
In [2]: df = pd.read_csv('concrete_data.csv')
In [3] 
Ñ df.sample(5) 
Out[3]: Blast Furnace Fly sa Coarse Fine 
iio A h 
Cement Slag Ash Water Superplasticizer Aggregate Aggregate ge Strengt 
964 143.7 170.2 132.6 191.6 8.5 814.1 8053 28 29.87 
536 393.0 0.0 0.0 192.0 0.0 940.6 7856 28 39.60 
919 313.0 0.0 0.0 178.0 8.0 1000.0 8220 28 25.10 
795 525.0 0.0 0.0 189.0 0.0 1125.0 613.0 180 61.92 
232 213.7 98.1 245 181.7 6.9 1065.8 7854 56 50.77 
In [4]: df . shape 
(1030, 9) 


Out[4]: 


df.isnull().sum() 

Cement 0 

Blast Furnace Slag 0 

Fly Ash 0 

Water 0 

Superplasticizer 0 

Coarse Aggregate 0 

Fine Aggregate 0 

Age 0 

Strength 0 

dtype: int64 

df.describe() 

Blast Coarse Fine
Cement Furnace Fly Ash Water Superplasticizer Age 
Aggregate Aggregate 
Slag 

count 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 

mean 281.167864 73.895825 54.188350 181.567282 6.204660 972.918932 773.580485 45.662136 
std 104.506364 86.279342 63.997004 21.354219 5.973841 77.753954 80.175980 63.169912 
min 102.000000 0.000000 0.000000 121.800000 0.000000 801.000000 594.000000 1.000000 
25% 192.375000 0.000000 0.000000 164.900000 0.000000 932.000000 730.950000 7.000000
50% 272.900000 22.000000 0.000000 185.000000 6.400000 968.000000 779.500000 28.000000 
75% 350.000000 142.950000 118.300000 192.000000 10.200000 1029.400000 824.000000 56.000000 
max 540.000000 359.400000 200.100000 247.000000 32.200000 1145.000000 992.600000 365.000000 


X = df.drop(columns=['Strength'])
y = df.iloc[:,-1]


X train, X test, y train, y test = train test _split(X, y, test size=0.2,random state=52) 


Applying Regression without any transformation 


lr = LinearRegression() 


lr.fit(X train,y train) 
y pred = lr.predict(X test) 
r2 score(y test,y pred) 


0.601364413277667 
# Cross checking with cross val score 
lr = LinearRegression() 


np.mean(cross_val_score(lr,X,y,scoring='r2'))


0.4609940491662866 


!pip install seaborn
Requirement already satisfied: seaborn in c:\programdata\anaconda3\lib\site-packages (0.1 
1.2) 
Requirement already satisfied: pandas>=0.23 in c:\programdata\anaconda3\lib\site-packages 
(from seaborn) (1.3.4) 
Requirement already satisfied: numpy>=1.15 in c:\programdata\anaconda3\lib\site-packages
(from seaborn) (1.20.3) 
Requirement already satisfied: matplotlib>=2.2 in c:\programdata\anaconda3\lib\site-packag 
es (from seaborn) (3.4.3) 
Requirement already satisfied: scipy>=1.0 in c:\programdata\anaconda3\lib\site-packages (f 
rom seaborn) (1.7.1) 
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages 
(from matplotlib>=2.2->seaborn) (0.10.0) 
Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages 
(from matplotlib>=2.2->seaborn) (8.4.0) 
Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-p 
ackages (from matplotlib>=2.2->seaborn) (2.8.2) 
Requirement already satisfied: pyparsing>=2.2.1 in c:\programdata\anaconda3\lib\site-packa 
ges (from matplotlib>=2.2->seaborn) (3.0.4) 
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-pack 
ages (from matplotlib>=2.2->seaborn) (1.3.1) 
Requirement already satisfied: six in c:\programdata\anaconda3\lib\site-packages (from cyc 
ler>=0.10->matplotlib>=2.2->seaborn) (1.16.0) 
Requirement already satisfied: pytz>=2017.3 in c:\programdata\anaconda3\lib\site-packages 
(from pandas>=0.23->seaborn) (2021.3) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
WARNING: Ignoring invalid distribution -oblib (c:\programdata\anaconda3\lib\site-packages) 
# Plotting the distplots without any transformation 
for col in X train.columns: 

plt.figure(figsize=(14,4)) 

plt.subplot (121) 

sns.distplot(X train[col]) 

plt.title(col) 

plt.subplot(122) 

stats.probplot(X_train[col], dist="norm", plot=plt)

plt.title(col) 

plt.show() 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 


stplot' 


warnings.warn (msg, 


(an axes-level function for histograms). 


FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 


In [13]: 


In [14]: 


Out[14]: 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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Applying Box-Cox Transform 


pt = PowerTransformer (method='box-cox') 


X train transformed = pt.fit transform(X train+0.000001) 
X test transformed = pt.transform(X test+0.000001) 


pd.DataFrame({'cols':X_train.columns,'box_cox_lambdas':pt.lambdas_})


cols box cox lambdas 


0 Cement 0.192177 
1 Blast Furnace Slag 0.023543 
2 Fly Ash -0.033365 
3 Water 0.729294 
4 Superplasticizer 0.102799 
5 Coarse Aggregate 0.944492 
6 Fine Aggregate 1.912493 


7 Age 0.050675 


# Applying linear regression on transformed data 


lr = LinearRegression() 
lr.fit(X train transformed,y train) 


y pred2 = lr.predict(X test transformed) 


r2 score(y test,y pred2) 


0.8059395299868048 


# Using cross val score 


pt = PowerTransformer(method='box-cox')
X_transformed = pt.fit_transform(X+0.0000001)


lr = LinearRegression() 
np.mean(cross_val_score(lr,X_transformed,y,scoring='r2'))


0.6658537942219862 


# Before and after comparison for Box-Cox Plot
X train transformed = pd.DataFrame(X train transformed,columns=X train.columns) 


for col in X train transformed.columns: 
plt.figure(figsize=(14,4)) 

plt.subplot (121) 

sns.distplot(X train[col]) 

plt.title(col) 


ct 


plt.subplot(122)
sns.distplot(X_train_transformed[col])
plt.title(col)


LD 
B 
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plt.show() 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
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C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘hi 
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Apply Yeo-Johnson transform 


ptl = PowerTransformer() 


X train transformed2 = ptl.fit transform(X train) 
X test transformed2 = ptl.transform(X test) 


lr = LinearRegression() 
lr.fit(X train transformed2,y train) 


y pred3 = lr.predict(X test transformed2) 


print(r2 score(y test,y pred3)) 


pd.DataFrame({'cols':X_train.columns,'Yeo_Johnson_lambdas':ptl.lambdas_})


0.8096460862674353 


cols Yeo Johnson lambdas 


0 Cement 0.189513 
1 Blast Furnace Slag 0.010273 
2 Fly Ash -0.140102 
3 Water 0.727681 
4 Superplasticizer 0.271741 
5 Coarse Aggregate 0.944526 
6 Fine Aggregate 1.913745 
7 Age 0.005244 


# applying cross val score 


pt = PowerTransformer () 
X transformed2 = pt.fit transform(X) 


lr = LinearRegression() 


np.mean(cross val score(lr,X transformed2,y,scoringz'r2')) 


0.6834625134285743 


X train transformed2 = pd.DataFrame(X train transformed2,columns=X train.columns) 


X train transformed2 


Blast Furnace 


Cement Slag Fly Ash Water Superplasticizer 
0 -0.009023 0.904728 1.024625 -0.137365 0.776304 
1 -0.604728 -1.052106 1.132351 -1.035647 0.529274 
2 -0.603517 -1.052106 1.124763 -0.030435 0.247086 


3 1.636626 -1.052106 -0.900126 2.111781 -1.233985


Coarse 
Aggregate 


-0.786372 
1.450927 
1.097752 


-0.504527 


Fine 
Aggregate 


-0.407202 
0.282969 
0.021980 


-2.085586 


Age 


0.104565 
1.248846 
-1.696745 


2.159262 


Blast Furnace Coarse Fine 


Cement Slag Fly Ash Water Superplasticizer Aggregate Aggregate Age

4 0.938188 -1.052106 -0.900126 0.425694 0.388456 0.532615 -0.584531 0.104565 
819 -1.697137 1.141106 1.060729 0.825766 0.291024 -1.595598 0.069329 0.104565 
820 0.847477 1.125678 -0.900126 -0.753743 0.934141 -0.341994 -0.273968 -1.696745 
821 1.142699 0.834321 -0.900126 -1.598327 0.915241 -0.315130 0.991109 0.723317 
822 0.271409 -1.052106 -0.900126 0.186935 -1.233985 1.242641 -0.108751 0.104565 
823 0.794462 -1.052106 1.168879 0.567189 0.876763 -2.183395 0.010493 0.104565 


824 rows x 8 columns 


In [23]: # Before and after comparison for Yeo-Johnson


for col in X train transformed2.columns: 


plt.figure(figsize=(14,4)) 
plt.subplot (121) 
sns.distplot(X train[col]) 
plt.title(col) 
plt.subplot(122)
sns.distplot(X_train_transformed2[col])
plt.title(col)
plt.show() 
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warnings.warn(msg, FutureWarning) 
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our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
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FutureWarning: 


distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) 
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distplot' is a deprecated function and will be removed in a future version. Please adapt y 
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In [24]: 


# Side by side Lambdas 
pd.DataFrame(í'cols':X train.columns,'box cox lambdas':pt.lambdas ,'Yeo Johnson lambdas':r 


cols box cox lambdas Yeo Johnson lambdas 


Cement 0.169544 0.189513 

Blast Furnace Slag 0.016633 0.010273 
Fly Ash -0.136480 -0.140102 

Water 0.808438 0.727681 
Superplasticizer 0.264160 0.271741 
Coarse Aggregate 1.129395 0.944526 
Fine Aggregate 1.830763 1.913745 


Age 0.001771 0.005244 


Feature 
Engineering 101 


Topic - 4 


Function 
Transformer 


A 


Sample Code 


from sklearn.base import BaseEstimator, TransformerMixin


(BaseEstimator, TransformerMixin): 
(self, factor- ble 


self.factor = factor 


    def fit(self, X, y=None):
        return self


    def transform(self, X):
        return X * self.factor


Function Transformer 
import pandas as pd 
import numpy as np 
import scipy.stats as stats 


import matplotlib.pyplot as plt 
import seaborn as sns 


from sklearn.model selection import train test split 
from sklearn.metrics import accuracy score 
from sklearn.model selection import cross val score 


from sklearn.linear model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 


from sklearn.preprocessing import FunctionTransformer 
from sklearn.compose import ColumnTransformer 


df = pd.read csv('train.csv' ,usecols=['Age','Fare', 'Survived']) 


df 
Survived Age Fare 
0 0 220 7.2500 
1 1 380 71.2833 
2 1 260 7.9250 
3 1 350 53.1000 
4 O 35.0 8.0500 
886 O 27.0 13.0000 


887 1 19.0 30.0000 


Survived Age Fare 


888 O NaN 23.4500 
889 1 26.0 30.0000 
890 0 320 7.7500 


891 rows x 3 columns 


df.isnull().sum() 


Survived 0 
Age 177 
Fare 0 


dtype: int64 


df['Age'].fillna(df['Age'].mean(),inplace=True) 


df.isnull().sum() 


Survived 0 
Age 0 
Fare 0 


dtype: int64 


X = df.iloc[:,1:3]
y = df.iloc[:,0] 


X train, X test, y train, y test = train test split(X,y,test size=0.2,random state=42) 


#AGE>>>>>>>>>>>>>>>>>>>>>>>>>>>> 


plt.figure(figsize=(14,4)) 
plt.subplot (121) 
sns.distplot(X train['Age']) 
plt.title('Age PDF') 


plt.subplot (122) 
stats.probplot(X_train['Age'], dist="norm", plot=plt)
plt.title('Age QQ Plot')


LD 
+ 


plt.show () 


#Fare >>>>>>>>>>>>>>>>>>>>>>>>>>
plt.figure(figsize=(14,4))
plt.subplot(121)

sns.distplot(X_train['Fare'])
plt.title('Fare PDF')


Tu UT $ 
otto 


plt.subplot(122)
stats.probplot(X_train['Fare'], dist="norm", plot=plt)
plt.title('Fare QQ Plot')


n 
pc! 
+ &$ ct 


plt.show() 
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distplot' is a deprecated function and will be removed in a future version. Please adapt y 
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warnings.warn(msg, FutureWarning) 
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#Now the classifire time 


clf = LogisticRegression() 
clf2 = DecisionTreeClassifier() 


#Predict nad Fit 


clf.fit(X train, y train) 
clf2.fit(X train, y train) 


y pred = clf.predict(X test) 
y predl = clf2.predict(X test) 


print("Accuracy LR",accuracy score(y test,y pred)) 
print("Accuracy DT",accuracy score(y test,y predl)) 


Accuracy LR 0.6480446927374302 
Accuracy DT 0.664804469273743 


Now use Log Transformer


trf = FunctionTransformer(func=np.log1p)


X train transformed = trf.fit transform(X train) 
X test transformed = trf.transform(X test) 


clf = LogisticRegression() 
clf2 = DecisionTreeClassifier() 


clf.fit(X train transformed,y train) 
clf2.fit(X train transformed,y train) 


y pred = clf.predict(X test transformed) 
y predi = clf2.predict(X test transformed) 


print("Accuracy LR",accuracy score(y test,y pred)) 
print("Accuracy DT",accuracy score(y test,y predl)) 


Accuracy LR 0.6815642458100558 
Accuracy DT 0.6815642458100558 


#With cross val score 
X transformed = trf.fit transform(X) 


clf = LogisticRegression() 
clf2 = DecisionTreeClassifier() 


print ("LR",np.mean (cross val score(clf,X transformed, y, scoring='accuracy',cv=10))) 
print ("DT",np.mean (cross val score(clf2,X transformed, y, scoring='accuracy', cv=10))) 


LR 0.678027465667915 
DT 0.661123595505618 


#Fare >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
plt.figure(figsize=(14,4))


plt.subplot(121)
stats.probplot(X_train['Fare'], dist="norm", plot=plt)
plt.title('Fare Before Log')


plt.subplot(122)
stats.probplot(X_train_transformed['Fare'], dist="norm", plot=plt)
plt.title('Fare After Log')


plt.show() 


#Age >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 
plt.figure(figsize=(14,4)) 


plt.subplot (121) 
stats.probplot(X train['Age'], dist="norm", plot=plt) 
plt.title('Age Before Log') 


plt.subplot (122) 
stats.probplot(X train transformed['Age'], dist="norm", plot=plt) 
plt.title('Age After Log') 


plt.show() 
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Now use Log TransforMer 


trf2 = ColumnTransformer([('log',FunctionTransformer(np.log1p),['Fare'])],remainder='passthrough')


X train transformed2 = trf2.fit transform(X train) 
X test transformed2 = trf2.transform(X test) 


clf = LogisticRegression() 
clf2 = DecisionTreeClassifier() 


clf.fit(X train transformed2,y train) 
clf2.fit(X train transformed2,y train) 


y pred = clf.predict(X test transformed2) 
y pred2 = clf2.predict(X test transformed2) 


print ("Accuracy LR",accuracy score (y test, y pred)) 
print ("Accuracy DT",accuracy score (y test, y pred2)) 


Accuracy LR 0.6703910614525139 
Accuracy DT 0.6871508379888268 


X transformed2 = trf2.fit transform(X) 


clf = LogisticRegression() 
clf2 = DecisionTreeClassifier() 


print ("LR",np.mean (cross val score(clf,X transformed2,y,scoring='accuracy',cv=10))) 
print ("DT",np.mean (cross val score(clf2,X transformed2, y, scoring='accuracy',cv=10) ) ) 


LR 0.6712609238451936 
DT 0.6610736579275904 


In [21]: def apply_transform(transform):
    X = df.iloc[:,1:3]
    y = df.iloc[:,0]
    trf = ColumnTransformer([('log',FunctionTransformer(transform),['Fare'])],remainder='passthrough')
    X_trans = trf.fit_transform(X)
    clf = LogisticRegression()
    print("Accuracy",np.mean(cross_val_score(clf,X_trans,y,scoring='accuracy',cv=10)))
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    stats.probplot(X['Fare'], dist="norm", plot=plt)
    plt.title('Fare Before Transform')
    plt.subplot(122)
    stats.probplot(X_trans[:,0], dist="norm", plot=plt)

    plt.title('Fare After Transform')

    plt.show()

In [22]: _ : 

apply_transform(np.sin)


Accuracy 0.6195131086142323 
Fare Before Transform Fare After Transform 
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Feature e 


Engineering 101 


Topic - 5 
a I 
Encoding's 
1. Ordinal
2. One-Hot 
3. Binary 


4. Target 
5. Helmert 


6. Leave One Out 4 


In [3]: 


Categorical to Continuous 


Feature Encoding 


Label 
Encoding 


Frequency 
Encoding 


import numpy as np 
import pandas as pd 


Binary 
Encoding 


df = pd.read csv('cars.csv') 


df 
brand 
0 Maruti 
1 Skoda 
2 Honda 
3 Hyundai 
4 Maruti 


8123 Hyundai 


8124 Hyundai 


8125 Maruti 
8126 Tata 
8127 Tata 


km driven 
145500 
120000 
140000 
127000 


120000 


110000 
119000 
120000 

25000 


25000 


8128 rows x 5 columns 


Continuous to Categorical 


Unsupervised 
Binning 
One Hot 
Encoding 
Equal Width 
Binning 
Target Mean 
Encoding 
fuel owner selling price 
Diesel First Owner 450000 
Diesel Second Owner 370000 
Petrol Third Owner 158000 
Diesel First Owner 225000 
Petrol First Owner 130000 
Petrol First Owner 320000 
Diesel Fourth & Above Owner 135000 
Diesel First Owner 382000 
Diesel First Owner 290000 
Diesel First Owner 290000 


print (df.isnull().sum()) 


brand 

km driven 
fuel 
owner 


O © © © 


Supervised Binning 


Entropy 
based 
Binning 


selling price 
dtype: int64 


print (d 
print (d 


Fl: Fh 


First Owner 
Second Owner 
Third Owner 


Test Drive Car 


Name: owner, dtype: 


Diesel 4402 
Petrol 3631 
CNG 57 
LPG 38 


Name: fuel, dtype: 


print (df['brand'].value counts ()) 


Maruti
Hyundai
Mahindra
Tata 
Toyota 
Honda 

Ford 
Chevrolet 
Renault 
Volkswagen 
BMW 

Skoda 
Nissan 
Jaguar 
Volvo 
Datsun 
Mercedes-Benz 
Fiat 

Audi 

Lexus 

Jeep 
Mitsubishi 
Force 

Land 

Isuzu 

Kia 
Ambassador 
Daewoo 

MG 

Ashok 

Opel 
Peugeot 


Name: brand, dtype: 


0 


Fourth & Above Owner 


2448 
1415 
772 
734 
488 
467 
397 
230 
228 
186 
120 
105 
81 
71 
67 
65 
54 
47 
40 
34 
31 
14 


Oo 


HR keh e. LQ L. d BOD 


int64 


'owner'].value counts()) 
'fuel'].value counts ()) 


5289 
2105 
999 
174 
5 


int64 


int64 


Now apply OneHot encoding with pandas 


pd.get dummies (df, columns=['fuel', 'owner']) 


brand km driven selling price fuel CNG fuel Diesel 


fuel LPG fuel Petrol 


owner First 
Owner 


owner Fourth 
& Above 
Owner 


owner Fourth 


brand km driven selling price fuel CNG fuel Diesel fuel LPG fuel Petrol dale & Above EN 

Owner 

0 Maruti 145500 450000 0 1 0 0 1 0 

1 Skoda 120000 370000 0 1 0 0 0 0 

2 Honda 140000 158000 0 0 0 1 0 0 

3 Hyundai 127000 225000 0 1 0 0 1 0 

4 Maruti 120000 130000 0 0 0 1 1 0 
8123 Hyundai 110000 320000 0 0 0 1 1 0 
8124 Hyundai 119000 135000 0 1 0 0 0 1 
8125 Maruti 120000 382000 0 1 0 0 1 0 
8126 Tata 25000 290000 0 1 0 0 1 0 
8127 Tata 25000 290000 0 1 0 0 1 0 


8128 rows x 12 columns 


K-1 OneHot Encoding 


#K-1 means we drop the first dummy column of each encoded feature (two columns in this table); the reason for this step is "Multicollinearity".


pd.get dummies (df, columns=['fuel', 'owner'],drop_first=True) 


owner_Fourth 
owner_Second owner_Test 


brand km driven selling price fuel Diesel fuel LPG fuel Petrol & Above Oner Drive Car 
Owner 

O Maruti 145500 450000 1 0 0 0 0 0 

1 Skoda 120000 370000 1 0 0 0 1 0 

2 Honda 140000 158000 0 0 1 0 0 0 

3 Hyundai 127000 225000 1 0 0 0 0 0 

4 Maruti 120000 130000 0 0 1 0 0 0 
8123 Hyundai 110000 320000 0 0 1 0 0 0 
8124 Hyundai 119000 135000 1 0 0 1 0 0 
8125 Maruti 120000 382000 1 0 0 0 0 0 
8126 Tata 25000 290000 1 0 0 0 0 0 
8127 Tata 25000 290000 1 0 0 0 0 0 


8128 rows x 10 columns 


OneHotEncoding using Sklearn 


from sklearn.model selection import train test split 
X train,X test,y train, y test = train test split (df.iloc[:,0:4],df.iloc[:,-1],test_size=0. 


X train.head() 


brand km driven fuel owner 
6310 Maruti 131111 Diesel First Owner 
2219 Maruti 29000 Petrol First Owner 
4600 X Maruti 85000 Diesel Second Owner 
2167 Maruti 90000 Petrol Second Owner 
5272 Hyundai 80000 Diesel Second Owner 


from sklearn.preprocessing import OneHotEncoder 


ohe = OneHotEncoder (drop='first',sparse=False, dtype=np.int32) 


X_train_new = ohe.fit_transform(X_train[['fuel','owner']])
X_test_new = ohe.transform(X_test[['fuel','owner']])


print(X train new.shape) 
print(X test new.shape) 


(5689, 7) 
(2439, 7) 


Now merge X train new and X train table 


np.hstack((X train[['brand','km driven']].values,X train new)) 
array([['Maruti', 131111, 1, ..., 0, 0, 0],

['Maruti', 29000, 0, ..., 0, 0, 0],

['Maruti', 85000, 1, ..., 1, 0, 0],

...,

['Tata', 15000, 0, ..., 0, 0, 0],

['Maruti', 32500, 1, ..., 1, 0, 0],

['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)


Now apply OneHot Encoding to the categorical
column (Brand)


counts = df['brand'].value counts () 


df['brand'].nunique() 
threshold = 100 


repl = counts[counts <= threshold].index 


pd.get dummies (df['brand'].replace(repl, 'uncommon')).sample(50) 


BMW Chevrolet Ford Honda Hyundai Mahindra Maruti Renault Skoda Tata Toyota Volkswagen um 


4309 0 0 0 0 0 0 1 0 0 0 0 0 
7010 0 0 0 0 0 0 1 0 0 0 0 0 
10 0 0 0 0 0 0 0 1 0 0 0 0 
5309 0 0 0 0 0 0 0 1 0 0 0 0 
5737 0 0 0 0 0 1 0 0 0 0 0 0 
1502 0 0 0 0 0 0 1 0 0 0 0 0 
7096 0 0 1 0 0 0 0 0 0 0 0 0 
6758 0 0 0 0 0 0 0 1 0 0 0 0 
950 0 0 0 0 1 0 0 0 0 0 0 0 
5624 0 0 0 0 0 0 1 0 0 0 0 0 
736 0 0 0 0 0 0 0 0 0 0 1 0 
4334 1 0 0 0 0 0 0 0 0 0 0 0 
1417 0 0 0 0 1 0 0 0 0 0 0 0 
3088 0 0 0 0 0 0 1 0 0 0 0 0 
172 0 0 1 0 0 0 0 0 0 0 0 0 
6716 0 0 0 0 1 0 0 0 0 0 0 0 
5323 0 0 0 0 0 0 0 0 0 0 1 0 
87 0 0 0 0 0 0 1 0 0 0 0 0 
3132 0 0 0 0 0 1 0 0 0 0 0 0 
6994 0 0 0 0 0 0 1 0 0 0 0 0 
3704 0 0 0 0 1 0 0 0 0 0 0 0 
4783 0 0 0 0 0 0 0 0 0 1 0 0 
1136 0 0 0 0 0 0 0 0 0 0 0 0 
7272 0 0 0 0 0 0 0 0 0 1 0 0 
265 0 0 0 0 0 0 0 0 0 0 1 0 
3477 0 0 1 0 0 0 0 0 0 0 0 0 
2820 0 0 0 0 0 0 1 0 0 0 0 0 
1495 0 0 0 0 0 1 0 0 0 0 0 0 
2598 0 0 0 0 1 0 0 0 0 0 0 0 
1691 0 0 0 0 0 0 0 0 0 0 0 0 
3421 0 0 0 0 1 0 0 0 0 0 0 0 
7319 0 0 0 0 0 1 0 0 0 0 0 0 
2832 0 0 0 0 0 0 0 0 0 0 0 0 


1581 0 0 0 0 1 0 0 0 0 0 0 0 


BMW Chevrolet Ford Honda Hyundai Mahindra Maruti Renault Skoda Tata Toyota Volkswagen um 


6799 0 0 0 0 1 0 0 0 0 0 0 0 
1158 0 0 0 0 0 0 0 1 0 0 0 0 

183 0 0 0 0 1 0 0 0 0 0 0 0 
2067 0 0 0 0 0 0 0 0 0 0 0 1 
7194 0 0 0 0 1 0 0 0 0 0 0 0 
5327 0 0 0 0 1 0 0 0 0 0 0 0 
5192 0 0 0 0 1 0 0 0 0 0 0 0 
5608 0 0 0 0 0 0 0 0 0 0 1 0 
1036 0 0 0 0 0 0 1 0 0 0 0 0 
7077 0 0 0 0 0 0 0 0 0 0 0 0 
6291 0 0 0 0 0 0 1 0 0 0 0 0 
7348 0 0 0 0 1 0 0 0 0 0 0 0 
7267 0 0 0 0 0 1 0 0 0 0 0 0 
3383 0 0 0 0 0 0 1 0 0 0 0 0 
7216 0 0 0 0 1 0 0 0 0 0 0 0 
5415 0 0 0 0 0 0 0 0 0 1 0 0 


What is Multi-Collinearity? 


Multicollinearity is a phenomenon that occurs when 
two or more independent variables in a multiple 
regression model are highly correlated with each other. 
This means that they are measuring the same or similar 
information, and this can lead to unstable and unreliable 
coefficient estimates. 


When multicollinearity is present, the coefficients of the 
independent variables may change erratically in 
response to small changes in the model or data. This 
makes it difficult to interpret the importance of each 
independent variable and can lead to incorrect 


conclusions about the relationship between the 
variables. 


Multicollinearity can be detected by calculating the 
correlation matrix of the independent variables and 
looking for high correlation coefficients. Variance 
Inflation Factor (VIF) can also be used to detect 
multicollinearity. A VIF value greater than 5 is an 
indication of high multicollinearity. 


To address multicollinearity, one can remove one of the 
highly correlated independent variables from the model, 
or combine them into a single variable. Another 
approach is to use principal component analysis to 
extract a new set of uncorrelated variables from the 
original correlated set. 
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PipiLine in Machine Learning 


Without using Pipeline 


import numpy as np 
import pandas as pd 


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier


df = pd.read_csv('train.csv')


df.sample(5) 


Passengerld Survived  Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 
de 
559 560 1 Messemaeken. enale 260 1 O 345572 1740 NaN S 


Mrs. Guillaume 
Joseph (Emma) 


Van Impe, 
419 420 0 3 Miss. female 10.0 0 2 345773 24.15 NaN S 
Catharina 
van Billiard, A/5 
153 154 0 3 Mr. Austin male 40.5 0 2 851 14.50 NaN S 
Blyler 
741 742 0 Cavendish Me le: 360 1 O 19877 7885 C46 S 


Tyrell William 


Passengerld Survived  Pclass 


Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 


Skoog, Mrs. 
167 168 0 Wiliam Anna, ale: AGG 1 4 347088 27.90 NaN S 
Bernhardina 
Karlsson) 
# select the important columns
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)


df.sample(5) 


Survived Pclass Sex 
597 0 3 male 
533 1 3 female 
277 0 2 male 
152 0 3 male 
82 1 3 female 


Age SibSp Parch Fare Embarked 


49.0 


NaN 


NaN 


32:5 


NaN 


O 0.0000 S 
2 22.3583 C 
0 0.0000 S 
0 8.0500 S 
0 7.7875 Q 


Step 1 - Train-Test-Split 


X train,X test,y train,y test 


print(X train.head(2)) 


Pclass Sex Age 
331 1 male 45.5 
133 2 male 23.0 


print(y train.head()) 


331 0 
733 0 
382 0 
704 0 
813 0 


Name: Survived, dtype: 


df.isnull().sum() 


Survived 
Pclass 

Sex 

Age 17 
SibSp 

Parch 

Fare 

Embarked 
dtype: int64 


loo © © A © © o 


SibSp 
0 
0 


int64 


= train test split (df.drop (columns=['Survived']), 


Parch 
0 
0 


df['Survived'], 
test size=0.2, 
random state=42) 


Fare Embarked 
28.5 S 
13.0 S 


# Applying imputation


si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')


X_train_age = si_age.fit_transform(X_train[['Age']])


X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])


print(X_train_embarked)
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One-Hot Encoding with Sex and Embarked 


ohe_sex = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse=False, handle_unknown='ignore')


X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)


X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test_embarked)


X_train_embarked


array(t[[O.. Dep Lal, 
Qr Os; P 
Qr ‘0%; ; 
Qu Oey ; 
Qr Org P 
Do 0; 1) 


X train.head(2) 


Pclass Sex Age SibSp Parch Fare Embarked 


331 1 male 45.5 0 0 28.5 S

733 2 male 23.0 0 0 13.0 S

X_train_rem = X_train.drop(columns=['Sex','Age','Embarked'])
X_test_rem = X_test.drop(columns=['Sex','Age','Embarked'])


Now concatenate the remaining columns with the imputed Age column and the
one-hot-encoded Sex and Embarked columns for X_train, and do the same for X_test.


X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)


X_test_transformed.shape


(179, 10) 


clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)


DecisionTreeClassifier()


y_pred = clf.predict(X_test_transformed)


y_pred


dtype 


int64) 


1], 


from sklearn.metrics import accuracy_score


accuracy_score(y_test,y_pred)


0.776536312849162 


import pickle 
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PipiLine in Machine Learning 


Pred. without PipeLine 


import pickle 
import numpy as np 


ohe_sex = pickle.load(open('ohe_sex.pkl','rb'))
ohe_embarked = pickle.load(open('ohe_embarked.pkl','rb'))
clf = pickle.load(open('clf.pkl','rb'))


# Process Flow >>>>>>> Pclass/gender/age/SibSp/Parch/Fare/Embarked 


test_input = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)


test_input


array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)


test_input_sex = ohe_sex.transform(test_input[:,1].reshape(1,1))


test_input_sex


array([[0., 1.]])


test_input_embarked = ohe_embarked.transform(test_input[:,-1].reshape(1,1))


test_input_embarked


array([[0., 0., 1.]])


Step by Step apply Pipeline 


With PipeLine 


import numpy as np 
import pandas as pd 


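from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier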


df = pd.read_csv('train.csv')


Dunn 
NNN NN A AW 


pd 


Lear 
Leak, 
Lear 
Lear 
Lear 
Lear 
Lear 
Lear 


D D D D D 5 3 Q 


.read_ 


df.sample(5) 


.model selection import train test split 
.compose import ColumnTransformer 
.impute import SimpleImputer 
.preprocessing import OneHotEncoder 
.preprocessing import MinMaxScaler 


caev('train.csv') 


Passengerld Survived  Pclass Name 


760 


423 


210 


503 


386 


761 


424 


211 


504 


387 


Garfirth, 
Mr. John 


Danbom, 

Mrs. 

Ernst 

0 3 Gilbert 
(Anna 

Sigrid 

Maria... 


Ali, Mr. 
Ahmed 


Laitinen, 
Miss. 
Kristina 
Sofia 


Goodwin, 
Master. 
Sidney 
Leonard 


# select the important columns
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)


df.sample (5) 


Survived Pclass 


700 


1 


1 female 18.0 1 


Sex Age SibSp Parch 


.pipeline import Pipeline,make pipelin 
.feature selection import SelectKBest,chi2 
.tree import DecisionTreeClassifier 


Sex Age SibSp Parch 


male NaN 


female 28.0 


male 24.0 


female 37.0 


male 1.0 


0 227.5250 


Fare Embarked 


C 


e 


Ticket 


358585 


347080 


SOTON/O.Q. 
3101311 


4135 


CA 2144 


Fare 


14.5000 


14.4000 


7.0500 


9.5875 


46.9000 


Cabin Embarked 


NaN S 
NaN 5 
NaN 5 
NaN S 
NaN S 


Survived Pclass Sex Age SibSp Parch Fare Embarked 


621 1 1 male 42.0 1 0 52.5542 S 
520 1 1 female 30.0 0 0 93.5000 S 

60 0 3 male 22.0 0 0 7.2292 C 
462 0 1 male 47.0 0 O 38.5000 S 


Step 1 - Train-Test-Split 


X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                 random_state=42)


print(X_train.head(2))


Pclass Sex Age SibSp Parch Fare Embarked 
331 1 male 45.5 0 0 28.5 S 
733 2 male 23.0 0 0 13.0 S 


print(y_train.head())


331 0 
733 0 
382 0 
704 0 
813 0 


Name: Survived, dtype: int64 


df.isnull().sum() 


Survived 
Pclass 

Sex 

Age 17 
SibSp 

Parch 

Fare 

Embarked 
dtype: int64 


loc © © «o oo 


Step.2 Imputation Transformer 


# imputation transformer
trf1 = ColumnTransformer([

    ('impute_age',SimpleImputer(),[2]),

    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')


Step.3 One-Hot Encoding 


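# NOTE: trf2 receives the output of the imputation transformer, whose columns
# are ordered [Age, Embarked, Pclass, Sex, SibSp, Parch, Fare] (transformed
# columns first, passthrough remainder last). At this stage indices [1,6]
# therefore select Embarked and Fare; Sex sits at index 3.
trf2 = ColumnTransformer([


    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')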


Step.4 Scaling 


trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))


])


Step.5 Feature Selection 


trf4 = SelectKBest(score func=chi2,k=8) 


Step.6 Train the Model 


trf5 = DecisionTreeClassifier() 


Step.7 Create PipeLine 


pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)


])


Step7.1 Create Pipeline with Make PipeLine 


pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)


pipe.fit(X_train,y_train)


columntransformer-1: ColumnTransformer 


impute age impute embarked remainder 


ohe sex embarked remainder 


i DecisionTreeClassifier: 


pipe.named steps 


('columntransformer-1': ColumnTransformer (remainder-'passthrough', 
transformers-[('impute age', SimpleImputer(), [2]), 
('impute embarked', 
Simplelmputer(strategy-'most frequent'), 
[61) I, 
'columntransformer-2': ColumnTransformer (remainder='passthrough', 
transformers=[('ohe sex embarked', 
OneHotEncoder (handle unknown-'ignore', 
sparse=False), 


Lis 9131394 
'columntransformer-3': ColumnTransformer(transformers-[('scale', MinMaxScaler(), slice(0, 
10, None))]), 
'selectkbest': SelectKBest (k=8, score func=<function chi2 at 0x000002836 
'decisiontreeclassifier': DecisionTreeClassifier()) 


mil 


ODED30>), 


# Display Pipeline 


from sklearn import set_config
set_config(display='diagram')


# Predict
y_pred = pipe.predict(X_test)


y_pred

array (ll; O, 0, O0, 1, Qj O, Os O 0, 0, 0 Or Ly O Dy 0, Q0, O, Q0, 20, 0, 
Or Os O, O, 1, 0, O, O, OG O, 1, O, Os Ly O, Og 1, 0, ©, 0, 0, O, 
Ly OD OG Op Op l2 ly Oy O, O, Og Oy Q LO 0,4 0, 0, 0, 0, .0, a 
Oy Or L, Ly Og Lo Lo Ly O 0, Oj 1, Or O, Op 0, 1, 0, Ly 1, l, 1; 
Or Us Oz O, Dy Op O 0, 07-0, OG Oy. OF 0, 1, 0, O, O, O, 0, 0, 1, 
O, Q. 1, 0, O0, Og Ly 1, 0, OG 0; 1, Og 0, O, l4 0, 1, 1; 0% 0, 1, 
Or Uy Lg D. Ly O, 20, O, Or 0, 0L 05 0, 0, O, Oy Lg OG Lp O, 0, 0, 
O, O, O, 0, O, Or Ls O Og 1, O0, 1, Oy O, O Os 1, 0; Oy, 1, ©, 1; 
0, 0, 0], dtype=int64) 


from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)


0.6256983240223464 


Cross Validation using Pipeline 


# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()


0.6391214419383433 


Feature e 


Engineering 101 


Topic - 7 


Handling 
Mixed & Date-Time 
Variables 


A 


Handling Mixed Data in Machine Learning 


In [2]: df = pd.read_csv('titanic.csv')


In [3]: 


df.head() 
out [3]: Cabin Ticket number Survived 
O NaN A/5 21171 5 0 
1 C85 PC 17599 3 1 
2 NaN STON/O2. 3101282 6 1 
3 C123 113803 3 1 
4 NaN 373450 A 0 
In [4]: df['number'].unique() 
outjaje NN UE Ut St, | 
In [5]: import matplotlib.pylab as plt
fig = df['number'].value_counts().plot.bar()
fig.set_title('Passengers travelling with')
plt.show()
Passengers travelling with 
140 
120 
100 
80 
60 
40 
20 
0 
« w - N + un m 
In [6]: df['number_numerical'] = pd.to_numeric(df["number"],errors='coerce',downcast='integer')
df['number_categorical'] = np.where(df['number_numerical'].isnull(),df['number'],np.nan)
df.head()
Out[6]: Cabin Ticket number Survived number numerical number categorical 
O NaN A/5 21171 5 0 5.0 NaN 
1 C85 PC 17599 3 1 3.0 NaN 


2 NaN STON/O2. 3101282 6 1 6.0 NaN 


Ticket number 


113803 3 


373450 A 


df['Cabin'] .unique () 


array ( [nan, 'C85', 'C123', 'E4 
"C23 C25 C27*', 'BT8', ' 
VE GIS, ES, PAS, 7 
'F E69', 'D47', 'B86', 
'A32', 'B4', 'B80', 'A3 
MOST. BITS SHEET, "UB 
"B49"; TDT; "C22 C26', 
'B57 B59 B63 B66', 'C7' 
rey; VEIDE, "D37! BS 
"A34", "C104', "E111", 
"B37*, "C30", *D20', 'B 
"B39', "B22"; 'CB86', "C 
"BAT, "A20"; "DIG", "D 
E58', "C126", 'BJLI', * 
"C62 C64', 'E24', 'C90! 
EI21*', “DIT. TETI p " 
B102', 'B69', 'E49', 
'C148'], dtype-object) 


df['Ticket'].unique() 


array(['A/5 21171', 
'330877', 
'113783', 


"PC 17599' 
'17463', '349 
'A/5. 2151', 

'244373', '345763', '26 
1347077", "2631", "1995 
"PC 17569", 1335611, 
"A./5. 2152", "345764", 
'SC/Paris 2123", '33095 
"2662"; 349237, 

"113509", '19947', 
"2669", "113572", '3697 
"C.A. 29395", 'S.P. 346 
'S.O.C. 14879', 12680"; 
'248738', '364516', '34 
'SO/C 14885', '3101278' 
'343276', '347466', 
"PC: 17754", "PC 17759", 
"35281", "7540", "31012 
"371110*, “L10465", T26 
"STON/O 2. 3101294", "3 
"310372", "C 17369", '2 
'SOTON/O.Q. 3101307', 
'SC/PARIS 2133', '11752 
'STON/O2. 3101279', 
'230080', '244310', 
'A/5. 851', 'Fa 265302' 
"315037', "CA. 2343", 
"363291", 7113505", "PE 
'17764', "350404", 
'SC/PARIS 2131', '23013 
'364849', '349247', 
'368703', '4579', 
'A/5 3540', '347054', 
'SOTON/O.Q. 3101311', 


'C.A 


B51 B53. B55', 


'c47', 


'C.A. 24579', 


"31012951; 


F 
'W.E. 


'A/5. 


'365222', 
'S.0.P. 


"371362", 
"4133", 

"234604", 
"370370"; 


"2699", 
"CC. 


Survived number numerical 


1 3.0 


0 NaN 


6', 'G6', 'C103', 
D33', 'B30', 'C52', ' 
D10 D12', 'D26', 
‘HOT. 'C2', 'E33', 
1', 'D36', 'D15', 
94', 'C125', 'C99', 
'C106', 'C65', 'E36', 
, 'E34', 'C32', 'B18', 
5', 'E50', 'C82', 
iust 'E38', 'D21', * 
79', 'E25', 'D46', 
70', 'A16', 'C101', 
50', 'D9', 'A23', 
'D49', ' 
'B101', 


; “CAST; TES! 
B38'., "B3', "D6"; 
'D28', 'El7', 


, 'STON/O2. 3101282', 
909', '347742', 
'347082', '350406', 
49', '239865', '248698 
0', '330959', '349216' 
'PC 17604 
'2651', '7546', 
gt, 'S.C./A.4, 23567', 
'A/4. 39886', 
. 31026', '2697', 
3', '347088', 
4', 1510128427. "315151 
'1601', '348123', 
5767', '345779', 
'W./C. 6608', 
P. 5734', 'C.A. 2315 
'231919', '244367', 
76', '349207', '343120 
65', '324669', '4136', 
70369', 'PC 17558', 
668', '347061', 
3337', '228414', 
'7534', 'PC 17593', 
'231945', 'C.A. 
1166', '113776', 
"PC 17597', '35851', 
'C.A. 33595', 
'111240', 


1 
F 


F 


17318", 
“PC 17595", 
"315153", "113767" 
"28424", 
"248747", "345770 
1361231"; 
13528", 


6", 


"D56', 
B28', 
"ClIl0t; 
"BIST, 
"COS! 
"C118"; 


"B96 B98', ' 
E12', ' 


number categorical 
NaN 


A 


"AG", 

'C83', 'F33', 

'B58 B60', 'E101', 
'A7', 'C49', 'F4', 

'C78', 'D35', 

'D7', 'A19', 


'C54', 
'C124', 'CO1', 'E40', 


E10', 'E44', 


873", 

'C68', 
'B50', 
B5', 


E63', 'A14', 
'C95', 'B38', 
'A10', 'E68', 
'A26', 'D48', 


'B82 B84', 
'A24', 


!237736*, 
'248706', 


F 


'11668', 


BO 17572", 
"CLA. 
'PC 17605', 


'349208', 
"330932"; 
'SOTON/OQ 392086", 


"349245", 


"A4. 
"349241", 


'STON/O 2. 
'250653', 


L 


'350046', 


18201, TF 663", 
'D45', 'C46', 'D30', 
'D17', 'A36', 
'C50', 'B42', 


'113803', '373450', 
'PP 9549", 
'382652', 
'330923', '113788', 
ese 17601', 
'113789', 
'349253', 
'370371', '14311', 
'2926', 
34651', 'CA 2144', 
'2661', 
'C.A. 331111; 
'374746', 
1113059", 
1343275", 
1374910", 
1349215", 
', "312991", "349249", 
12627", 
54510", 


1 
, 


'2677', 


L 
, 


1 
, 


'364500', 


L 
, 


'27267', 


'C.A. 29178', 
12678"; "347081", 
33112"; "350043", 
"As Sa 11206", 
'SOTON/OQ 392090', 
'347068', '315093', 
3101280', 
"LINE", 
"370365", "111428", 
"PC 17610", 
'3101264', '2628', 


E 
, 


'112277', 
'A/5 21174', 


'250646', 


'367229', '35273', 'STON/O2. 
'W/C 14208', 'SOTON/OQ 392089', 

'19943', 'PP 4348', 'SW/PP 751', 
'237442', 'C.A. 29566', 'W./C. 
'28665', 'SCO/W 1585', '367230', 
'STON/O 2. 3101275', '2694', 

"244252", "362316", "1135148, 
"PC 17585"; "110152", 
'112059', '382649', 'C.A. 
'113798', '250644', 'PC 17596', 

'239853', 'C.A. 2673', '336439', 
'113056', '349239', '345774', 
19877', '11967', 
"2693" 11113781" 
V7AZ2L Ly "BG ILSE, 
"250651", 734924370 'EF.C.QO. 
16966', 'A/5 21172', '349219', 
"1133611. '113043', 'PC 17611', 
'248740', '244361', '229236', 
"CLA. 37671', "315088", 
'237671', '330931', '330980', 
'SOTON/O.Q. 3101310', 
"RE 17477; MITOS; 

'2648', '347069', "PC 17757", 
'349227', '27849', '367655', "SC 
'3101277', '350052', '350407', 
'STON/O 2. 3101289', '341826', 
120106"; '3129927, '349222', 
'343095', '28220', '250652', 
"A/D. 13032", "315082", 
'364851', 'SOTON/O.Q. 392078", 
'STON/O 2. 3101274', '13507', 

'230434', '65306', '33638', 
'113051', '17453', 'A/5 2817', 
VELGE 13531', "371060", 
'A/S 2816', 'SOTON/O.Q. 
'315089', 'SC/AH Basle 541', 
'3101298', '239854', 'A/5 3594', 
'65304', 'SOTON/OQ 3101317', 
'36947', 'C.A. 6212', '350035', 
"41:35, "2636007 1111427"; 
"PC 174735 "PC 17603", 
"349242", '12749', "349252", 
'W./C. 14258', 'PC 17483", 
'315084', '113050', 'PC 17761', 
'2908', '693', 'SC/PARIS 2146', 
'347085', '113807', '11755', 
'218629', 'SOTON/OQ 392082', 
'349205', '2686', '350417', 

'14312', 'A/4. 20589', '358585', 
'STON/O 2. 3101286', '237789', ' 
1149731, Al die 32954, 
'SC/AH 29037', '248727', '2664', 

'111426', '349910', '349246', '11 
370377", '364512', '220845', '31 
"54636", '36963', "219533", 


"19988", 


"3101267"; 


Y SOT 


E 7076', * 
1323951", 
'2653', 


'S.W. 


'STON/O 2. 


3101283', 
'220367', 
‘A/S 21193", 
'26707', 
14263", 
1347071", 
"370129", 


6609", 
"WO 
119928", 

'A/5. 
PCI; 
17248", 
"370375, 


'349206', 
'349236', 
'367226', 

'PC 17485', 

1347470", 

1234818", 

1349225", 

1248733", 

'7267', '] 

'SC/PARIS 2167', 

'2626', ' 

"C 9077, 


'SC/Paris 2163', 
19234', 
'P/PP 3381', 

13529', 


'28403', 
'4] 
'394] 
'28228', 
'347080', 
"141 
'C.A. 
"113794", 
"349240", 
"364506", 


"19952", 
3101306', 
"7593"; 


118787; 
'315086', 
'C 4001", 
1349209", 
12624", 
13101296", 
1364498", 
1244358", 
1345572", 
'ON/O.Q. 
/PP 752', 


13049', 


13804", 
1028", 
'349224", 


3336", 
'230433', 
'347083', 


347464", 


13510, 


10813", 


748", 


STEG 
40", 


'A/4. 


0564", 
18723", 


"1137927"; 


4134", 


"36967", 


243880", 
' 34] 
3101273', 
349214', 
' 507 


' 3349] 


'243847', 
'21440', 


113508", 
"345778", 
'237798', 


1345364", 
17598", ' 
131418", 
12695", 


'STON/O 2. 
'113760', 
'244278', 

'315096', 

'STON/O 2. 

1345773", 

34244", 
1376564", 


'2666', 
"13509", 


"31027", 
'11771', 
'PC 17609', 
'364846', 
'382651', 
'C.A. 
'2700', 
'29104', 
'13568', 
'330979', 
"372622", 
392087", 
"11769", 
12689", 


"113796", 


"2659", 


'13214', 19220521; 

"3499913; '13213', 

'330919', '365226', 
'STON/O 2. 3101285' 
'29750*, “F.C, 
'342826', '4138', 

'17474', '349256', 
'348121', 
'PC 17482', 
'347062', '350048', 


14 


12750; 
1330935", .! 


'PC 17475', 
'113028', 


'237668', 
SO 7 PPS 
'349223', 
'234686' 
"C.A. 


"2672" 5 
'36864' 
atar, 
112233", 


"113800", 


'STON/O 2. 3101 


"11813", 
'349234', 
'236171', '347067', 
"O.A. 3192T*, 


'250649', '11751', 
'2650', 
'384461', '110413', 
"PC 17582', "PC 17760", 
1347073", 
'A/5. 10482", 
1370373", 
1349233", 'PC 
1226593", 'A/5 2466", 
'11767', 'PC 17608', 
'29011', '36928', 
128551", 
1139894. 
'386525', 
'2647', 
'2691', 
14313', 
'113503', 
3101293', 
'350034', 
'240929', 
'28664', '347064', 
3101269', 
1349254", 
12003", "250655", 
'SC/AH 3085", 
'345769', "347076", 
1113786", '65303', 
"17464", 
1111320", "234360", 
'36209', '323592', 
'3460', '350060', 
'A.5. 18509', 
'A/4 45380', 
1330909", 
'SOTON/OQ 3101316", 
34260", "226875", 
1367232", 
12641", "2690", 
'WE/P 5735", 
12620", 
1349251", 
'A/4 48871", 
"PC 17474", 


17612*, 


'345783', 


11', '237565', '13567', 
'A/5 3902', '364848', 
1364511', 
TON/O.Q. 3101305", 
"11753", '350029', 
2%, 19950420, '347743', 
1292", "350050", 


751', PCAs 231 
129751", 
, 312993", 
24580", 
6563", 


"350025", 
"250647", 
"250643", 


E 


12623", 
'A/5 3536", 
1244270", 
1349228", 
1248731", 
1223596", 
"348124", 
1113806", 


14", '349221', "8475", 
15727", "349210", 
119996", 
1239856", "349912", 
1350036", "24160", 
1363592", "35852", 
"PC 17476", 
134218", '36568', 
'315094', "36866", 


In [9] 


Out[9]: 
In [10 

Out[10]: 
In [11]: 


df['cabin num'] = 
df['cabin cat'] = 


'236853', 'STON/O2. 3101271', '239855', '28425', '233639', 
'349201', '349218', '16988', '376566', 'STON/O 2. 3101288', 
'250648', '113773', '335097', '29103', '392096', '345780', 
'349204', '350042', '29108', '363294', 'SOTON/O2 3101272', '2663', 
'347074', '112379', '364850', '8471', '345781', '350047', 
'"S.O./P.P. 3', "2674", '29105', "347078", "383121", "36865", 
'2687', '113501', 'W./C. 6607', 'SOTON/O.Q. 3101312', '374887', 
'3101265', '12460', 'PC 17600', '349203', '28213', '17465', 
'349244', '2685', '2625', '347089', '347063', '112050', '347087', 
'248723', '3474', '28206', '364499', '112058', 'STON/O2. 3101290', 
'S.C./PARIS 2079', 'C 7075', '315098', '19972', '368323', '367228', 
"2611", "347468", *2223', "PC 17756', '315097*, "392092", "11174%, 
'SOTON/O2 3101287', '2683', '315090', 'C.A. 5547', '349213', 
"347060", "PC 17592', '392091', "113055', "2629", "350026"; 
'28134', '17466', '233866', '236852', 'SC/PARIS 2149', 'PC 17590', 
"345777", '349248', '695', '345765', '2667', '349212', '349217', 
'349257', '7552', 'C.A./SOTON 34068', 'SOTON/OQ 392076', '211536', 
'112053', '111369', '370376'], dtype-object) 


Ticket number 


df.head() 
Cabin 
0 NaN A/5 21171 
1 C85 PC 17599 
2 NaN STON/O2. 3101282 
3 C123 113803 
4  NaN 373450 


df['Cabin'].str.extract('(\d+)')
df['Cabin'].str[0]


Survived number numerical 


0 


1 


5.0 


3.0 


6.0 


3.0 


NaN 


df['cabin_cat'].value_counts().plot(kind='bar')


<AxesSubplot:> 


60 


50 


10 


# extract the last bit of ticket as number
df['ticket_num'] = df['Ticket'].apply(lambda s: s.split()[-1])
df['ticket_num'] = pd.to_numeric(df['ticket_num'],
                                 errors='coerce',
                                 downcast='integer')


# extract the first part of ticket as category 


number categorical 


NaN 


NaN 


s.split()[-11) 


# captures numerical part 
# captures the first letter 


cabin num 
NaN 


85 


cabin cat 


icket cat'] 
icket cat'] 


df.head(20) 


Cabin Ticket number Survived number numerical number categorical 
O NaN es 5 0 5.0 NaN NaN 
1 C85 PC 17599 3 1 3.0 NaN 85 
2 NaN eu 6 1 6.0 NaN NaN 
3 C123 113803 3 1 3.0 NaN 123 
4 NaN 373450 A 0 NaN A NaN 
5 NaN 330877 2 0 2.0 NaN NaN 
6 E46 17463 2 0 2.0 NaN 46 
7 NaN 349909 5 0 5.0 NaN NaN 
8 NaN 347742 1 1.0 NaN NaN 
9 NaN 237736 A 1 NaN A NaN 
10 G6 | PP9549 1 1.0 NaN 6 
11 C103 113783 1 1.0 NaN 103 
12 NaN A/5. 2151 3 0 3.0 NaN NaN 
13 NaN 347082 3 0 3.0 NaN NaN 
14 NaN 350406 5 0 5.0 NaN NaN 
15 NaN 248706 3 1 3.0 NaN NaN 
16 NaN 382652 3 0 3.0 NaN NaN 
17  NaN 244373 2 1 2.0 NaN NaN 
18 NaN 345763 5 0 5.0 NaN NaN 
19 NaN 2649 4 1 4.0 NaN NaN 
df['ticket_cat'].unique()
array([*A/5!, "PO", "STON/02."; han, PP", 'A/5."', 'C.A.', TAs baty 
"SC/Paris"', "SJELA. Ey VANN, “SCAT. “SePetp "SO Cet, "SO/E", 
"W./C.', 'SOTON/OQ'. 'W.E.P.'. "STON/O', *A4.', “CT, SOTON/O.Q.', 
SC/PARIS'; 'SO.BP.', T'A.5D.', MES y "OAS, "LINE", 'EC.C.', WC, 
'SW/PP', 'SCO/W', 'P/PP', 'SC', 'SC/AH', 'A/S', 'A/4', 'WE/P', 


'S.W./PP', 


df['ticket_cat'] = df['Ticket'].apply(lambda s: s.split()[0])
df['ticket_cat'] = np.where(df['ticket_cat'].str.isdigit(), np.nan,
                            df['ticket_cat'])


'C.A./SOTON'], 


"SOS BREST. 
dtype=object) 


df['ticket num'] .unique () 


array LL 


WN WW NY 


.117100e4 
.734500e4 
.477420e4 
.151000e4 


+04, 
+05, 
+05, 
+03, 


.826520e4 


HOS, 


NUNUH 


.759900e+ 
.308770e+ 
.377360e+ 
.470820e+ 
.443730e+ 


'"F.C.', 'SOTON/C2' 


04, 
05, 
05, 
05, 
05, 


3.101282e+06, 
1.746300e+04, 
9.549000e+03, 
3.504060e+05, 
3.457630e+05, 


sesplitiO FOI) 


, 


NO Ho 


cabin num cabin cat ticket num 


'S.C./PARIS', 


.138030e+ 
.499090e+ 
.137830e+ 
.487060e+ 
.649000e+ 


05, 
05, 
05, 
05, 
03, 


NaN 


21171.0 


17599.0 


3101282.0 


113803.0 


373450.0 


330877.0 


17463.0 


349909.0 


347742.0 


237736.0 


9549.0 


113783.0 


2151.0 


347082.0 


350406.0 


248706.0 


382652.0 


244373.0 


345763.0 


2649.0 
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.398650e+ 
.470770e+ 
.492160e+ 
.457900e+ 
.152000e+ 
.166800e+ 
.356700e+ 
.492370e+ 
.926000e+ 
.697000e+ 
.135720e+ 
.661000e+ 
.151510e+ 
.601000e+ 
.487380e+ 
.309320e+ 
.608000e+ 
.474660e+ 
.749100e+ 
.443670e+ 
.540000e+ 
.129910e+ 
.665000e+ 
.101294e+ 
.726700e+ 
.470610e+ 
.284140e+ 
.534000e+ 
.101279e+ 
.500430e+ 
.137760e+ 
.759700e+ 
.343000e+ 
150930e+ 
112400e+ 
133000e+ 
.131000e+ 
.703650e+ 
.346040e+ 
.687030e+ 
.457700e+ 
.470540e+ 
.101311e+ 
.672290e+ 
.181300e+ 
.144000e+ 
.510000e+ 
.374420e+ 
.192100e+ 
.426300e+ 
.470710e+ 
.623160e+ 
.650000e+ 
.304330e+ 
.826490e+ 
.776000e+ 
.703750e+ 
.673000e+ 
.048200e+ 
.492060e+ 
1.196700e+ 

.761200e+ 
.234000e+ 
1.742100e+ 
1.176700e+ 
L.352900e+ 
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.486980e+ 
.631000e+ 
.760100e+ 
.760400e+ 
.457640e+ 
.492530e+ 
.703710e+ 
.101295e+ 
.135090e+ 
.465100e+ 
.697300e+ 
.939500e+ 
.311100e+ 
.481230e+ 
.645160e+ 
.130590e+ 
.920860e+ 
.734000e+ 
.775400e+ 
.492450e+ 
.101276e+ 
.492490e+ 
.246690e+ 
.703690e+ 
.703720e+ 
.492410e+ 
.917800e+ 
.759300e+ 
.652220e+ 
.300800e+ 
.120600e+ 
.585100e+ 
.713620e+ 
.632910e+ 
.101280e+ 
.759500e+ 
.301360e+ 
.114280e+ 
.842400e+ 
.579000e+ 
.101264e+ 
.699000e+ 
.352800e+ 
.527300e+ 
.420800e+ 
.492340e+ 
.117300e+ 
.956600e+ 
.866500e+ 
.101275e+ 
.506490e+ 
1.135140e+ 

.758500e+ 
.844610e+ 
1.724800e+ 
1.137980e+ 

.350200e+ 
.364390e+ 
.130560e+ 
.377980e+ 
.163000e+ 
.693000e+ 
.672260e+ 
.775800e+ 
.760800e+ 
.474700e+ 
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.309230e+ 
1.995000e+ 
1.756900e+ 

.137890e+ 
.651000e+ 
.123000e+ 
.431100e 
.988600e+ 
.994700e+ 
.144000e+ 
70880e+ 
64000e+ 
87900e+ 
92080e+ 
57670e+ 
88500e+ 
32750e+ 
2.315000e+ 
.775900e+ 
92150e+ 
92070e+ 
.711100e+ 
.136000e+ 
.755800e+ 
.736900e+ 
.101307e+ 
.133000e 
.678000e+ 
.319450e+ 
.443100e+ 
.510000e+ 
.920900e+ 
.359500e+ 
.135050e+ 
.776400e+ 
.506530e+ 
.151530e 
.648490e+ 
.500460e+ 
.703700e+ 
.628000e+ 
.672310e+ 
.117400e+ 
.101283e+ 
.920890e+ 
.994300e+ 
.361710e+ 
6.609000e+ 
.585000e+ 
.694000e+ 
.175100e+ 
.336000e+ 
1.101520e+ 


.104130e+ 
.4 


70830e 


70730e 
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.506440e+ 
+05, 
74640e+ 
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.137880e4 
.309590e4 
.356770e4 
.677000e4 
.546000e4 
.309580e4 
.662000e4 
.757200e4 
.102600e4 
.669000e4 
.760500e4 
.101281e4 
.680000e4 
.747460e4 
-457790e4 
.101278e4 
.432760e4 
.645000e4 
.319190e4 
.528100e4 
.431200e4 
.104650e4 
.627000e4 
.451000e4 
.668000e4 
.337000e4 
.175200e4 
.470810e4 
.311200e4 
.166000e4 
.653020e4 
.150370e4 
.470680e4 
.731800e4 
.504040e4 
nan, 
.137670e4 
.492470e4 
.761000e4 
.487470e4 
.540000e4 
.122770e4 
.506460e4 
.438470e4 
.203670e4 
.348000e4 
.470670e4 
.670700e4 
.672300e4 
.992800e4 
-442520e4 
.701290e4 
1.775500e4 
1.120590e4 
1.758200e4 

.759600e4 
.398530e4 
-457780e4 
-457740e4 
.987700e4 
-492330e4 
.998800e4 
.466000e4 
.748500e4 
.492430e4 
.692800e4 
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.696600e+ 
.453640e+ 
.761100e+ 
.487400e+ 
.141800e+ 
.267000e+ 
.457830e+ 
.167000e+ 
.108130e+ 
.176500e+ 
.135030e+ 
.653000e+ 
.676550e+ 
.101277e+ 
.442780e+ 
.137000e+ 
.910600e+ 
.101269e+ 
.822800e+ 
.150820e+ 
.506550e+ 
.765640e+ 
.872300e+ 
.530600e+ 
.137860e+ 
.817000e+ 
1.353100e+ 
1.113200e+ 

.137920e+ 
.410000e+ 
.500600e+ 
.134000e+ 
.101317e+ 
.694700e+ 
.648460e+ 
1.114270e+ 

.747300e+ 
.426000e+ 
.492520e+ 
.425800e+ 
.641000e+ 
.776100e+ 
.908000e+ 
.309790e+ 
.175500e+ 
.186290e+ 
.492050e+ 
.176900e+ 
.585850e+ 
.377890e+ 
1.356700e+ 
3.902000e+ 
.664000e+ 
.114260e+ 
.101305e+ 
.102800e+ 
.463600e+ 
.349120e+ 
.120520e+ 
.492310e+ 
.475000e+ 
.975100e+ 
.101285e+ 
.999600e+ 
.442700e+ 
.138000e+ 
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.117200e+ 
.855100e+ 
.492250e+ 
.443610e+ 
.865250e+ 
.135100e+ 
.376710e+ 
.691000e+ 
.626000e+ 
.101267e+ 
.648000e+ 
.101293e+ 
.748000e+ 
.500520e+ 
.409290e+ 
.150960e+ 
.129920e+ 
.430950e+ 
.457730e+ 
.470800e+ 
.648510e+ 
.085000e+ 
.457690e+ 
.363800e+ 
.530300e+ 
.492400e+ 
.710600e+ 
.343600e+ 
.620900e+ 
.553000e+ 
.101298e+ 
.177100e+ 
.137870e+ 
.212000e+ 
.309090e+ 
.001000e+ 
.760300e+ 
.268750e+ 
.624000e+ 
.748300e+ 
.690000e+ 
.644980e+ 
.930000e+ 
.620000e+ 
.455720e+ 
.920820e+ 
.686000e+ 
.747400e+ 
.438800e+ 
.304900e+ 
.497300e+ 
.648480e+ 
.492140e+ 
.499100e+ 
.703770e+ 
.659000e+ 
.696300e+ 
.704200e+ 
.376680e+ 
.321300e+ 
.309190e+ 
.623000e+ 
.346860e+ 
.975000e+ 
.398560e+ 
.309350e+ 
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.492190e+ 
.113610e+ 
.598000e+ 
.292360e+ 
.767100e 
.695000e+ 
.309310e+ 
.101310e+ 
.431300e+ 
.239510e+ 
.470690e 
.492270e+ 
.137600e 
.504070e+ 
.101289e 
.866400e+ 
.492220e+ 
.822000e+ 
.492540e+ 
.424400e+ 
.920780e+ 
.101274e+ 
.470760e4 
.137940e+ 
.130510e4 
.350900e+ 
.995200e4 
.816000e+ 
.235920e4 
.102700e+ 
.398540e4 
.850900e+ 
.760900e4 
.500350e+ 
.135000e+ 
.826510e+ 
.492090e+ 
.492420e+ 
.700000e+ 
.101296e+ 
.150840e+ 
.356800e+ 
.146000e+ 
.470850e+ 
.726220e+ 
.920870e+ 
.504170e+ 
.431200e+ 
.689000e+ 
.411000e+ 
.235000e+ 
.903700e+ 
.137960e4 
.492460e+ 
.645120e 
.175300e+ 
.195330e 
.477430e+ 
.101292e4 
.314000e+ 
.652260e 
.727000e« 
.129930e 
.275000e+ 
.499120e+ 
.563000e+ 
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.348180e+ 
.130430e+ 
.137840e+ 
.487330e+ 
.150880e+ 
.647000e+ 
.309800e+ 
.076000e+ 
.747700e+ 
.077000e+ 
.775700e+ 
.784900e+ 
.500340e+ 
.840300e+ 
.418260e+ 
.470640e+ 
.941400e+ 
.506520e+ 
.303200e+ 
.003000e+ 
1.105640e+ 

.350700e+ 
.304340e+ 
.666000e+ 
1.745300e+ 

.746400e+ 
.645060e+ 
.101306e+ 
.150890e+ 
.460000e+ 
.594000e+ 
.530400e+ 
.538000e+ 
.150860e+ 
.636000e+ 
.101316e+ 
.696700e+ 
1.274900e+ 
3.672320e+ 
.910400e+ 
.130500e+ 
.735000e+ 
.443580e+ 
.138070e+ 
.492510e+ 
.887100e+ 
.520000e+ 
.058900e+ 
.101286e+ 
.375650e+ 
.101273e+ 
.487270e+ 
.645110e+ 
.138040e+ 
.208450e+ 
.500290e+ 
.492240e+ 
.321400e+ 
.500500e+ 
.492210e+ 
.492230e+ 
.492100e+ 
.536000e+ 
.458000e+ 
.428260e+ 
.492280e+ 


.500360e+05, 
.138000e+05, 
.481210e+05, 
.235960e+05, 
.545000e+03, 
.656800e+04, 
.506430e+05, 
.368530e+05, 
.336390e+05, 
.765660e+05, 
.350970e+05, 
.492040e+05, 
.101272e+06, 
.648500e+05, 
.000000e+00, 
.831210e+05, 
.607000e+03, 
.246000e+04, 
.746500e+04, 
.470890e+05, 
.487230e+05, 
.120580e+05, 
.150980e+05, 
.671000e+03, 
.150970e+05, 
.683000e+03, 
.470600e+05, 
.629000e+03, 
.338660e+05, 
.457770e+05, 
.667000e+03, 
.552000e+03, 
.120530e+05, 


.416000e+04, 
.487310e+05, 
.747500e+04, 
.747600e+04, 
.506470e+05, 
.470620e+05, 
.138060e+05, 
.101271e+06, 
.492010e+05, 
.101288e+06, 
.910300e+04, 
.500420e+05, 
.663000e+03, 
.471000e+03, 
.674000e+03, 
.686500e+04, 
.101312e+06, 
.760000e+04, 
.492440e+05, 
.470630e+05, 
.474000e+03, 
.101290e+06, 
.997200e+04, 
.474680e+05, 
.920920e+05, 
.150900e+05, 
.759200e+04, 
.500260e+05, 
.368520e+05, 
.492480e+05, 
.492120e+05, 
.406800e+04, 
.113690e+05, 


.492560e+05, 2.672000e+03, 
.635920e+05, 3.585200e+04, 
.686400e+04, 3.500250e+05, 
.748200e+04, 1.130280e+05, 
.481240e+05, 3.421800e+04, 
.500480e+05, 1.223300e+04, 
.150940e+05, 3.686600e+04, 
.398550e+05, 2.842500e+04, 
.492180e+05, 1.698800e+04, 
.506480e+05, 1.137730e+05, 
.920960e+05, 3.457800e+05, 
.910800e+04, 3.632940e+05, 
.470740e+05, 1.123790e+05, 
.457810e+05, 3.500470e+05, 
.910500e+04, 3.470780e+05, 
.687000e+03, 1.135010e+05, 
.748870e+05, 3.101265e+06, 
.492030e+05, 2.821300e+04, 
.685000e+03, 2.625000e+03, 
.120500e+05, 3.470870e+05, 
.820600e+04, 3.644990e+05, 
.079000e+03, 7.075000e+03, 
.683230e+05, 3.672280e+05, 
.223000e+03, 1.775600e+04, 
.177400e+04, 3.101287e+06, 
.547000e+03, 3.492130e+05, 
.920910e+05, 1.130550e+05, 
.813400e+04, 1.746600e+04, 
.149000e+03, 1.759000e+04, 
.950000e+02, 3.457650e+05, 
.492170e+05, 3.492570e+05, 
.920760e+05, 2.115360e+05, 
.703760e+05]) 
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Handling Date and Time variable in Machine 
Learning 


import numpy as np 
import pandas as pd 


date = pd.read_csv('orders.csv')
time = pd.read_csv('messages.csv')


date.head() 


date product id city id orders 


0 2019-12-10 5628 25 3 
1 2018-08-15 3646 14 157 
2 2018-10-23 1859 25 1 


3 2019-08-17 7292 25 1 


date product id city id orders 


4 2019-01-06 4344 25 3 


time.head() 


date 


msg 


0 2013-12-15 00:50:00 ULLY Ha CerogHa Mykuka 37 


1 2014-04-29 23:40:00 MAPEHb BA NLET APYTA CENYAC!! CMC MMC 0955532826 


2 2012-12-30 00:21:00 AHenp.m 43 no3H.c A/*x *.o 067.16.34.576 


3 2014-11-28 00:31:00 KNEB NLY A/X AO 45 MHE CEUYAC CKYYHO 093 629 9... 


4 2013-10-26 23:11:00 3aa a Te6a Hukorga He o6uxy nro6nto Teba!) Aawe 


print (date.info()) 
print (time.info()) 


<class 'pandas.core.frame.DataFrame'> 
RangeIndex: 1000 entries, 0 to 999 


Data columns (total 4 columns): 
$ Column Non-Null Count Dtype 
0 date 1000 non-nu object 
1 product_id 1000 non-nu int64 
2 city id 1000 non-nu int64 
3 orders 1000 non-nu int64 


dtypes: int64(3), object (1) 
memory usage: 31.4+ KB 
None 
«class 'pandas.core.frame.DataFrame'> 
RangeIndex: 1000 entries, 0 to 999 
Data columns (total 2 columns): 

$ Column Non-Null Count Dtype 


0 date 1000 non-nu object 
1 msg 1000 non-nu object 
dtypes: object(2) 
memory usage: 15.8+ KB 
None 


Working with dates 


# Converting to datetime datatype
date['date'] = pd.to_datetime(date['date'])


date.info() 


«class 'pandas.core.frame.DataFrame'> 
RangeIndex: 1000 entries, 0 to 999 
Data columns (total 4 columns): 


$ Column Non-Null Count Dtype 
0 date 1000 non-nu datetime64 [ns] 
1 product_id 1000 non-nu int64 
2 city id 1000 non-nu int64 


te is weekend'] = np.where(date['date dow name'].isin(['Sunday', 'Saturday']), 1,( 


1. 2j 


C:\Users\HP\AppData\Local\Temp/ipykernel 75188/3985301858.py:23: FutureWarning: Series.dt. 


3 orders 1000 non-null int64 
dtypes: datetime64[ns] (1), int64 (3) 
memory usage: 31.4 KB 

#Extract year---- 

date['date year'] = date['date'].dt.year 

#Extract month---- 

date['date month no'] = date['date'].dt.month 
#Extract month name---- 

date['date month name'] = date['date'].dt.month name() 
#Extract day---- 

date['date day'] = date['date'].dt.day 

#Extract day of week---- 

date['date dow'] = date['date'].dt.dayofweek 
#Extract day name---- 

date['date dow name'] = date['date'].dt.day name () 
#Extract date is weekend?---- 

date['date is weekend'] = np.where(date['date dow name'].isin(['Sunday', 'Saturday']), 1, 0)

#Extract date week---- 

date['date week'] = date['date'].dt.week 

#Extract quarter---- 

date['quarter'] = date['date'].dt.quarter 

#Extract semester---- 

date['semester'] = np.where(date['quarter'].isin([1,2]), 1, 2)

FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  date['date week'] = date['date'].dt.week


date.drop(columns=['product_id','city id','orders']).head() 


date date year 


2019- 
12-10 


2018- 
08-15 


2018- 
10-23 


2019- 
08-17 


2019- 
01-06 


2019 12 
2018 8 
2018 10 
2019 8 
2019 1 


Working with Times 


import datetime 


December 


August 


October 


August 


January 


10 


15 


23 


17 


date month no date month name date day date dow date dow name date is weekend dat 


1 Tuesday 0 
2 Wednesday 0 
1 Tuesday 0 
5 Saturday 1 
6 Sunday 1 


today = datetime.datetime.today() 


today 


datetime.datetime(2023, 1, 29, 23, 41, 23, 626953)


today - date['date'] 


0 1146 days 23:41:23.626953 
1 1628 days 23:41:23.626953 
2 1559 days 23:41:23.626953 
3 1261 days 23:41:23.626953 
4 1484 days 23:41:23.626953 
995 1574 days 23:41:23.626953 
996 1515 days 23:41:23.626953 
997 1363 days 23:41:23.626953 
998 1428 days 23:41:23.626953 
999 1202 days 23:41:23.626953 
Name: date, Length: 1000, dtype: timedelta64[ns] 


(today - date['date']).dt.days 


0 1146
1 1628
2 1559
3 1261
4 1484
995 1574
996 1515
997 1363
998 1428
999 1202


Name: date, Length: 1000, dtype: int64 


# Months passed 


np.round((today -date['date']) / np.timedelta64(1, 'M'),0) 


0 38.0 
1 54.0 
2 51.0 
3 41.0 
4 49.0 
995 52.0 
996 50.0 
997 45.0 
998 47.0 
999 40.0 


Name: date, Length: 1000, dtype: float64 


time.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
 0   date    1000 non-null   object
 1   msg     1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


# Converting to datetime datatype
time['date'] = pd.to_datetime(time['date'])

time['hour'] = time['date'].dt.hour
time['min'] = time['date'].dt.minute
time['sec'] = time['date'].dt.second

time.head()

date

2013-12-15 00:50:00
2014-04-29 23:40:00
2012-12-30 00:21:00
2014-11-28 00:31:00
2013-10-26 23:11:00


msg 


ny Ha ceroaHs Myxuka 37 


MAPEHb BM MLLET APYTA CENYAC!! CMC MMC 0955532826 


AHenp.m 43 no3H.c n/x *.o 067.16.34.576 


KNEB "LUY A/X AO 45 MHE CEUYAC CKYYHO 093 629 9... 


#Extract time part 


time['time'] 


time.head() 


date 


2013-12-15 00:50:00 


2014-04-29 23:40:00 


2012-12-30 00:21:00 


2014-11-28 00:31:00 


2013-10-26 23:11:00 


#Time diff. 


3aa a Te6a Hukorga He o6wxy nro6nto Teba!) Aawe 


= time['date'].dt.time 


msg 


ny Ha ceroaHs Myxuka 37 


MAPEHb BM MLLET APYTA CENYAC!! CMC MMC 0955532826 


AHenp.m 43 no3H.c n/x *.o 067.16.34.576 


KNEB "LUY A/X AO 45 MHE CEUYAC CKYYHO 093 629 9... 


today - time['date'] 


0     3332 days 22:51:23.626953
1     3197 days 00:01:23.626953
2     3682 days 23:20:23.626953
3     2984 days 23:10:23.626953
4     3382 days 00:30:23.626953
995   3971 days 22:51:23.626953
996   3293 days 00:27:23.626953
997   3758 days 00:04:23.626953
998   3874 days 00:07:23.626953
999   3146 days 00:16:23.626953
Name: date, Length: 1000, dtype: timedelta64[ns]

3aa a Te6a Hukorga He o6uxy nro6nto Teba!) Aawe

# in seconds


(today - time['date'])/np.timedelta64(1,'s') 


hour 


0 


23 


0 


0 


23 


23 


min 


50 


40 


21 


31 


11 


sec 


time 


00:50:00 


23:40:00 


00:21:00 


00:31:00 


23:11:00 


0 2.879671e+08 
1 2.762209e+08 
2 3.182088e+08 
3 2.579010e+08 
4 2.922066e+08 
995 3.431767e+08 
996 2.845168e+08 
997 3.246915e+08 
998 3.347140e+08 
999 2.718154e+08 


Name: date, Length: 1000, dtype: float64 


# in hours 
(today - time['date'])/np.timedelta64(1,'h') 


0 79990.856563
1 76728.023230
2 88391.339896
3 71639.173230
4 81168.506563


995 95326.856563 
996 79032.456563 
997 90192.073230 
998 92976.123230 
999 75504.273230 
Name: date, Length: 1000, dtype: float64 


Feature | 
Engineering 101 
Topic - 8 
Deal with Missing Data 


1. CCA(Complete Case 
Analysis) 

2. Simple & Frequent 
Imputer(Numerical/Cate 
gorical Data) 

3. Random Sample 
Imputer 

4.KNN or Multivariate 


CCA (Complete Case Analysis) 


df = pd.read csv('data science job.csv') 


df.head() 
enrollee id city 
0 8949 city 103 
1 29725 city 40 
2 11561 city 21 
3 33241 city 115 
4 666 city 162 


city development index gender 


df.isnull().mean()*100 


enrollee id 
city 


city development index 


gender 


relevent experi 


enrol 


LENC 


ed university 


education leve 


major . 
experience 
company size 
company type 
training hours 
target 
dtype: 


float64 


df.shape 


(19158, 13) 


col 
col 


['city development index", 


discipline 


'enrolled university', 


'education level', 


'experience", 


"training hours'] 


df [cols] .sample(5) 


city development index enrolled university education level 


2179 


11724 


0.899 


0.884 


N 


LA 
C LQ M © © & NM ON © © NM © O 


CO w 


0.920 


0.776 


0.624 


0.789 


0.767 


.000000 
.000000 
.500261 
.530640 
.000000 
.014824 
.401086 
.683161 
.339284 
.994885 
.049274 
.998330 
.000000 


no enrollment 


no enrollment 


Male 


Male 


NaN 


NaN 


Male 


relevent experience 


Has relevent 
experience 


No relevent 
experience 


No relevent 
experience 


No relevent 
experience 


Has relevent 
experience 


[var for var in df.columns if df[var].isnull().mean() 


Graduate 


Graduate 2 


enrolled university 


no enrollment 


no enrollment 


Full time course 


NaN 


no enrollment 


experience training hours 


8.0 54.0 


0.0 NaN 


education level 


Graduate 


Graduate 


Graduate 


Graduate 


Masters 


< 0.05 and df[var].isnull() 


«1 


city development index 


enrolled university 


7189 0.897 no enrollment 
748 0.920 no enrollment 
13129 0.920 no enrollment 
df['education level'].value counts () 
Graduate 11598 
Masters 4361 
High School 2017 
Phd 414 
Primary School 308 


Name: education level, dtype: int64 


len(df[cols].dropna()) 


0.8968577095730244 


/ len(df) 


new df = df[cols].dropna() 
df.shape, new df.shape 


((19158, 13), (17182, 


new df.hist(bins=50, density-True, 


plt.show() 


5)) 


education level 
Masters 
Graduate 


Graduate 


figsize=(12, 12)) 


experience 
16.0 
20.0 


20.0 


training hours 
156.0 
55.0 


3.0 


In [12]: 


city development index 


05 0.6 0.7 0.8 09 


training hours 


0.012 
0.010 
mil 


100 150 200 250 300 


fig = plt.figure() 
ax = fig.add subplot(111) 


# original data 


04 


03 


02 


01 


00 


experience 


00 25 50 


75 100 125 150 175 200 


df['training hours'].hist(bins=50, ax=ax, density=True, color='red') 


# data after cca, the argument alpha makes the color transparent, so we can
# see the overlay of the 2 distributions


new df['training hours'].hist(bins=50, ax=ax, color='green', density=True, alpha=0.8) 


plt.show() 


0.012 


0.010 


0.008 


0.006 


0.004 


0.002 


0.000 


250 300 350 


In [13]: fig = plt.figure()
ax = fig.add_subplot(111)
# original data
df['training hours'].plot.density(color='red')
# Data After CCA
new_df['training hours'].plot.density(color='green')
plt.title('Data After CCA')
plt.show()
Data After CCA 

0.012 

0.010 

0.008 
S 
^ 0.006 
v 
a 

0.004 

0.002 

0.000 

-200 -100 0 100 200 300 400 500 
In [14]: 


fig = plt.figure() 
ax = fig.add subplot(111) 


# original data 
df['city development index'].hist(bins=50, ax=ax, density=True, color='red')


# data after cca, the argument alpha makes the color transparent, so we can
# see the overlay of the 2 distributions
new_df['city development index'].hist(bins=50, ax=ax, color='green', density=True, alpha=0.8)


Out[14]: <AxesSubplot:>


35 


30 
25 
15 
10 
5 
0 
0.5 0.6 0.7 0.8 0.9 
In [15]: fig = plt.figure()
ax = fig.add_subplot(111)
# original data
df['city development index'].plot.density(color='red')
# data after cca
new_df['city development index'].plot.density(color='green')
plt.title('Data After CCA')
plt.show()
Data After CCA 
12 
10 
8 
v 
à 
4 
2 
0 
02 04 0.6 0.8 10 12 
In [16]: 


fig = plt.figure() 
ax = fig.add subplot(111) 


# original data 
df['experience'].hist (bins=50, ax=ax, density=True, color='red') 


# data after cca, the argument alpha makes the color transparent, so we can 
# see the overlay of the 2 distributions 
new_df['experience'].hist(bins=50, ax=ax, color='green', density=True, alpha=0.8)


Out[16]: <AxesSubplot:>


In [17]: 


In [18]: 


Out[18]: 


04 


0.3 


0.2 


0.1 


0.0 
0.0 25 5.0 75 100 125 150 175 200 


fig = plt.figure() 
ax = fig.add subplot(111) 


# original data 
df['experience'].plot.density(color='red')


# data after cca 

new df['experience'].plot.density(color='green') 
plt.title('Data After CCA') 

plt.show() 


Data After CCA 


0.08 


0.02 


0.00 


temp = pd.concat([
    # percentage of observations per category, original data
    df['enrolled university'].value_counts() / len(df),

    # percentage of observations per category, cca data
    new_df['enrolled university'].value_counts() / len(new_df)
],
axis=1)


# add column names 
temp.columns = ['original', 'cca'] 


temp 


original cca 


no enrollment 0.721213 0.735188 


Full time course 0.196106 0.200733 


Part time course 0.062533 0.064079 


temp = pd.concat([
    # percentage of observations per category, original data
    df['education level'].value_counts() / len(df),

    # percentage of observations per category, cca data
    new_df['education level'].value_counts() / len(new_df)
],
axis=1)

# add column names
temp.columns = ['original', 'cca']

temp

                original       cca
Graduate        0.605387  0.619835
Masters         0.227633  0.234082
High School     0.105282  0.107380
Phd             0.021610  0.022116
Primary School  0.016077  0.016587


Handling Missing Categorical Data (frequent-value- 
imputation) 


import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


df = pd.read_csv('trainl.csv', usecols=['GarageQual','FireplaceQu','SalePrice'])


df.head() 


FireplaceQu GarageQual SalePrice 


0 NaN TA 208500 
1 TA TA 181500 
2 TA TA 223500 
3 Gd TA 140000 
4 TA TA 250000 


df.isnull().mean()*100 


FireplaceQu 47.260274 
GarageQual 5.547945 
SalePrice 0.000000 


dtype: float64 


df['GarageQual'].value counts () .plot (kind='bar') 


<AxesSubplot:> 


1200 


1000 


df ['GarageQual'] .mode () 


0 TA 
dtype: object 


In [7]: | fig = plt.figure() 
ax = fig.add subplot(111) 


df[df['GarageQual']=='TA']['SalePrice'].plot (kind='kde', ax=ax) 


df [df ['GarageQual'].isnull()]['SalePrice'].plot (kind='kde', ax=ax, color='red') 


lines, labels = ax.get_legend_handles_labels()
labels = ['Houses with TA', 'Houses with NA']
ax.legend(lines, labels, loc='best')


plt.title('GarageQual') 


Text (0.5, 1.0, 'GarageQual') 


Out [7]: 
le-5 GarageQual 
—— Houses with TA 
— Houses with NA 
In [8]: temp = df[df['GarageQual']=='TA']['SalePrice']
In [9]: df['GarageQual'].fillna('TA', inplace=True)
In [10]: df['GarageQual'].value_counts().plot(kind='bar')
Out[10]: <AxesSubplot:>
1400 
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In [11]: 


fig = plt.figure() 
ax = fig.add subplot(111) 


temp.plot (kind='kde', ax=ax) 


# distribution of the variable after imputation 


df[df['GarageQual'] == 'TA']['SalePrice'].plot (kind='kde', ax=ax, color='red') 
lines, labels = ax.get legend handles labels() 
labels = ['Original variable', 'Imputed variable'] 


ax.legend(lines, labels, loc='best') 


# add title 
plt.title('GarageQual') 


Text(0.5, 1.0, 'GarageQual') 


Out [11]: 
1e-6 GarageQual 
—— Original variable 
—— Imputed variable 
E 
n 
5 
à 
In [12]: df['FireplaceQu'].value counts () .plot (kind='bar') 
< de 
Out[12]: AxesSubplot 
hj Få 
In [13]: df['FireplaceQu'].mode() 
0 Gd 
Out [13]: 
HELLS] dtype: object 
In [14]: 


fig = plt.figure() 
ax = fig.add subplot (111) 


df[df['FireplaceQu']=='Gd']['SalePrice'].plot (kind='kde', ax=ax) 


df[df['FireplaceQu'].isnull()]['SalePrice'].plot (kind='kde', ax=ax, color='red') 


lines, labels = ax.get_legend_handles_labels()
labels = ['Houses with Gd', 'Houses with NA']
ax.legend(lines, labels, loc='best')


plt.title('FireplaceQu') 


Text (0.5, 1.0, 'FireplaceQu') 


Out[14]: 
le-5 FireplaceQu 
— Houses with Gd 
—— Houses with NA 
-200000 0 200000 400000 600000 800000 
In [15]: temp = df [df['FireplaceQu']=='Gd'] ['SalePrice'] 


In [16]: df['FireplaceQu'].fillna('Gd', inplace=True)


In [17]: df['FireplaceQu'].value counts () .plot (kind='bar') 


= ie 
Out[17]: AxesSubplot 


1000 
800 
600 
400 


200 


In [18]: fig = plt.figure()


ax = fig.add subplot (111) 
temp.plot (kind='kde', ax=ax) 


# distribution of the variable after imputation 
df[df['FireplaceQu'] == 'Gd']['SalePrice'] .plot (kind='kde', ax=ax, color='red') 


lines, labels = ax.get legend handles labels() 
labels = ['Original variable', 'Imputed variable'] 
ax.legend(lines, labels, locz'best') 


# add title 
plt.title('FireplaceQu') 


Text (0.5, 1.0, 'FireplaceQu') 


out [18]: 
le-6 FireplaceQu 
—— Original variable 
—— Imputed variable 
-200000 0 200000 400000 600000 800000 
In 149] from sklearn.model selection import train test split 
X train,X test,y train, y test = train test split (df.drop (columns=['SalePrice']),df['SaleP: 
A PENE 1 
Tn [29] from sklearn.impute import SimpleImputer 
. TER 
In [21]: imputer = SimpleImputer(strategy='most_frequent')
In [22]: X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
In [23]: imputer.statistics_
Out[23]: array(['Gd', 'TA'], dtype=object)


In [ ]: 


Missing Indicator (automatically-select-imputer- 


parameters) 


import numpy as np 
import pandas as pd 


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('train.csv')
df.head() 
Passengerld Survived  Pclass Name Sex Age SibSp Parch Ticket Fare 
Braund, Mr. A/5 
0 1 0 3 Owen Harris male 22.0 1 0 21171 7.2500 
Cumings, 
Mrs. John 
1 2 1 1 Bradley female 38.0 1 0 PC17599 71.2833 
(Florence 
Briggs Th... 
Heikkinen, STON/O2. 
2 3 1 3 Miss. Laina female 26.0 0 3101282 7.9250 
Futrelle, 
Mrs. 
3 4 1 1 Jacques female 35.0 1 0 113803 53.1000 
Heath (Lily 
May Peel) 
Allen, Mr. 
4 5 0 3 William male 35.0 0 0 373450 8.0500 
Henry 
df .drop(columns=['Passengerld','Name','Ticket','Cabin'],inplace=True) 


#devide the columns 


X = df .drop(columns=['Survived']) 


y = df['Survived'] 


X train,X test,y train, y test = 


X train.head() 


Cabin Embarked 
NaN 5 
C85 C 
NaN $ 
C123 S 
NaN $ 


train test split(X,y,test size=0.2,random state=2) 


Pclass Sex Age SibSp Parch Fare Embarked 


30 1 male 40.0 0 0 27.7208 E 
10 3 female 40 1 1 16.7000 S 
873 3 male 47.0 0 0 9.0000 S 
182 3 male 90 4 2 31.3875 S 
876 3 male 20.0 0 0 9.8458 S 


y train.head() 


30 0 
10 1 
873 0 
182 0 
876 0 


Name: Survived, dtype: int64 


numerical_features = ['Age', 'Fare']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Sex']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])


from sklearn import set config 


set config (display='diagram' ) 
clf 


preprocessor: 


ColumnTransformer 


iLogisticRegression | 
BE ee Heier DENN, 


param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
    'classifier__C': [0.1, 1.0, 10, 100]
}


grid_search = GridSearchCV(clf, param_grid, cv=10)


grid_search.fit(X_train, y_train)
print(f"Best params:")

print(grid_search.best_params_)

Best params: 

('classifier C': 1.0, 

sor num imputer strategy": 'mean') 


Internal CV score: 


print(f"Internal CV score: 


import pandas as pd 


cv results 


cv results = cv resul 
cv results[['param cl 


0.788 


pd.DataFrame(grid search.cv results ) 


lts.sort values ("mean test score", 


lassifier  C','param preprocessor — 


['most frequent', 'constant'], 


'preprocessor cat  imputer  strategy': 'most frequent', 'preproces 


(grid search.best score :.3f)") 


ascending=False) 
cat imputer strategy", 'param prep 


param classifier C param preprocessor cat imputer strategy param preprocessor num imputer strategy mei 


1.0 


1.0 


1.0 


1.0 


10 


10 


10 


10 


100 


100 


100 


most frequent 
most frequent 
constant 
constant 
most frequent 
most frequent 
constant 
constant 
most frequent 
most frequent 


constant 


mean 
median 
mean 
median 
mean 
median 
mean 
median 
mean 
median 


mean 


15 


param classifier C param preprocessor cat imputer strategy 


100 


0.1 


0.1 


0.1 


0.1 


constant 
most frequent 
most frequent 
constant 


constant 


param preprocessor num imputer strategy 
median 

mean 

median 

mean 


median 


me: 


Imputing Numerical Data (Mean/Median Imputation)


import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


from sklearn.model selection import train test split 
from sklearn.impute import Simplelmputer 
from sklearn.compose import ColumnTransformer 


df = pd.read csv('titanic toy.csv') 


df.head() 

Age Fare Family Survived 
0 220 7.2500 1 0 
1 38.0 71.2833 1 1 
2 260 7.9250 0 1 
3 35.0 53.1000 1 1 
4 350 8.0500 0 0 
df.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
 0   Age       714 non-null    float64
 1   Fare      846 non-null    float64
 2   Family    891 non-null    int64
 3   Survived  891 non-null    int64
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


df.isnull().mean() 


Age 0.198653 
Fare 0.050505 
Family 0.000000 


Survived 0.000000 
dtype: float64 


X = df.drop(columns=['Survived'])
y = df['Survived']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


X train.shape, X test.shape 


((712, 3), (179, 3))


X train.isnull().mean() 
Age 0.207865 
Fare 0.050562 


Family 0.000000 
dtype: float64 


mean age = X train['Age'].mean() 
median age = X train['Age'].median() 


mean fare = X train['Fare'].mean() 

median fare = X train['Fare'].median() 

X train['Age median'] = X train['Age'].fillna (median age) 

X train['Age mean'] = X train['Age'].fillna (mean age) 

X train['Fare median'] = X train['Fare'].fillna (median fare) 
X train['Fare mean'] = X train['Fare'].fillna (mean fare) 


C:\Users\HP\AppData\Local\Temp/ipykernel 69128/2444989457.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['Age median'] = X train['Age'].fillna (median age) 
C:\Users\HP\AppData\Local\Temp/ipykernel 69128/2444989457.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['Age mean'] = X train['Age'].fillna (mean age) 
C:\Users\HP\AppData\Local\Temp/ipykernel 69128/2444989457.py:4: SettingWithCopyWarning: 


A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['Fare median'] = X train['Fare'].fillna(median fare) 
C:\Users\HP\AppData\Local\Temp/ipykernel 69128/2444989457.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
X train['Fare mean'] = X train['Fare'].fillna (mean fare) 


X train.sample (5) 


Age Fare Family Age median Age mean Fare median Fare mean 
154 NaN 7.3125 0 28.75 29.785904 7.3125 7.3125 
580 25.0 30.0000 2 25.00 25.000000 30.0000 30.0000 


747 30.0 13.0000 0 30.00 30.000000 13.0000 13.0000 


Age Fare Family Age median Age mean Fare median Fare mean 


292 360 12.8750 0 36.00 36.000000 12.8750 12.8750 

435 14.0 120.0000 3 14.00 | 14.000000 120.0000 120.0000 

print('Original Age variable variance: ', X train['Age'].var()) 

print('Age Variance after median imputation: ', X train['Age median'].var()) 
print('Age Variance after mean imputation: ', X train['Age mean'].var()) 
print('Original Fare variable variance: ', X train['Fare'].var()) 

print('Fare Variance after median imputation: ', X train['Fare median'].var()) 
print('Fare Variance after mean imputation: ', X train['Fare mean'].var()) 


Original Age variable variance:  204.3495133904614 

Age Variance after median imputation: 161.9895663346054 
Age Variance after mean imputation:  161.81262452718673 
Original Fare variable variance: 2448.197913706318 

Fare Variance after median imputation:  2340.0910219753637 
Fare Variance after mean imputation:  2324.2385256705547 


fig = plt.figure() 
ax = fig.add subplot (111) 


# original variable distribution 
X train['Age'].plot (kind='kde', ax=ax) 


# variable imputed with the median 
X_train['Age_median'].plot(kind='kde', ax=ax, color='red')


# variable imputed with the mean 
X train['Age mean'].plot (kind='kde', ax=ax, color='green') 


# add legends 
lines, labels = ax.get legend handles labels() 
ax.legend(lines, labels, loc='best') 


«matplotlib.legend.Legend at 0x281a01a5850> 


0.05 

— Age 

— Age median 
0.04 — Age mean 


Density 


-40 -20 0 20 40 60 80 100 
fig = plt.figure() 
ax = fig.add subplot(111) 


# original variable distribution 
X train['Fare'].plot(kind='kde', ax=ax) 


# variable imputed with the median
X_train['Fare_median'].plot(kind='kde', ax=ax, color='red')

# variable imputed with the mean
X_train['Fare_mean'].plot(kind='kde', ax=ax, color='green')

# add legends
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')


«matplotlib.legend.Legend at 0x281a09eff10» 
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Density 


0.005 
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-200 


X train.cov() 


Age 

Fare 

Family 

Age median 
Age mean 
Fare median 


Fare mean 


Age 
204.349513 
70.719262 
-6.498901 
204.349513 
204.349513 
64.858859 


66.665205 


X train.corr() 


Age 

Fare 

Family 

Age median 
Age mean 
Fare median 


Fare mean 


Age 
1.000000 
0.092644 

-0.299113 
1.000000 
1.000000 
0.087356 


0.090156 


200 


Fare 


70.719262 


2448.197914 


17.258917 


57.957599 


55.603719 


2448.197914 


2448.197914 


Fare 


0.092644 


1.000000 


0.208268 


0.091757 


0.088069 


1.000000 


1.000000 


Family 


— Fare 
——- Fare median 
—— Fare mean 


400 


Family 
-6.498901 
17.258917 

2.735252 
-5.112563 
-5.146106 
16.476305 


16.385048 


-0.299113 


0.208268 


1.000000 


-0.242883 


-0.244610 


0.205942 


0.205499 


Age median 


600 


Age median 
204.349513 
57.957599 
-5.112563 
161.989566 
161.812625 
53.553455 


55.023037 


1.000000 
0.091757 
-0.242883 
1.000000 
0.999454 
0.086982 


0.089673 


800 


Age mean 
204.349513 
55.603719 
-5.146106 
161.812625 
161.812625 
51.358000 


52.788341 


Age mean Fare median 
1.000000 
0.088069 

-0.244610 
0.999454 
1.000000 
0.083461 


0.086078 


colorz'red') 


color='green') 


Fare_median 
64.858859 
2448.197914 
16.476305 
53.553455 
51.358000 
2340.091022 


2324.238526 


0.087356 
1.000000 
0.205942 
0.086982 
0.083461 
1.000000 


0.996607 


Fare_mean 
66.665205 
2448.197914 
16.385048 
55.023037 
52.788341 
2324.238526 


2324.238526 


Fare mean 
0.090156 
1.000000 
0.205499 
0.089673 
0.086078 
0.996607 


1.000000 


X train[['Age', 'Age median', 'Age mean']].boxplot() 


<AxesSubplot:> 
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Age Age_median Age mean 


X train[['Fare', 'Fare median', 'Fare mean']].boxplot() 


<AxesSubplot:> 


Fare Fare_median Fare_mean 


Using Sklearn 


X train,X test,y train, y test = train test split (X, y, test_size=0.2,random state=2) 


imputer1 = SimpleImputer(strategy='median')
imputer2 = SimpleImputer(strategy='mean')
trf = ColumnTransformer([
    ('imputer1', imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare'])
], remainder='passthrough')


trf.fit(X train) 


ColumnTransformer (remainder='passthrough', 


transformers=[('imputerl', SimpleImputer(strategy-'median'), 
['Age']), 
('imputer2', SimpleImputer(), ['Fare'])]) 


trf.named_transformers_['imputer1'].statistics_


array([28.75])


trf.named_transformers_['imputer2'].statistics_


array([32.61759689])


X train = trf.transform(X train) 
X test = trf.transform(X test) 


X train 
array([[ 40. ; 2441208, 0. 4 
4 gå 6s , 2 r 
47 ; 9. ; 0 r 
FL 7 49.5042, 0. + 
28.75 , 221.7792, 0. ; 
204759. 254925: ; Qu 1) 


Imputing Numerical Data (Arbitrary-Value- 
Imputation) 


import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


from sklearn.model selection import train test split 
from sklearn.impute import Simplelmputer 
from sklearn.compose import ColumnTransformer 


df = pd.read csv('titanic toy.csv') 


df.head() 

Age Fare Family Survived 
0 220 7.2500 1 0 
1 380 71.2833 1 1 
2 260 7.9250 0 1 
3 35.0 53.1000 1 1 
4 35.0 8.0500 0 0 


df.isnull().mean() 


Age 0.198653 
Fare 0.050505 
Family 0.000000 


Survived 0.000000 
dtype: float64 


X = df.drop(columns=['Survived'])
y = df['Survived']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


X train['Age 99'] = X train['Age'].fillna(99) 

X train['Age minusl'] = X train['Age'].fillna(-1) 

X train['Fare 999'] = X train['Fare'].fillna(999) 

X train['Fare minusl'] = X train['Fare'].fillna(-1) 


C:\Users\HP\AppData\Local\Temp/ipykernel 79272/3652012184.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 


X train['Age 99'] = 


C:\Users\HP\AppData\Local\Temp/ipykernel 79272/3652012184.py:2: SettingWithCopyWarning: 


X train['Age'].fillna (99) 


A value is trying to be set on a copy of a slice from a DataFrame. 


Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: 


ide/indexing.html#returni 
= '] 
C:\Users\HP\AppData\Local 


X train['Age minusl 


https://pandas.pydata.org/pandas-docs/stable/user gu 


ing-a-view-versus-a-copy 
- X train['Age'].fillna(-1) 
INTemp/ipykernel 79272/3652012184.py:4: SettingWithCopyWarning: 


A value is trying to be set on a copy of a slice from a DataFrame. 


E 


X train['Fare 999'] 


Pry using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
X train['Fare' 


].fillna(999) 


C:\Users\HP\AppData\Local\Temp/ipykernel 79272/3652012184.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 


X train['Fare minusl'] = X train['Fare'].fillna(-1) 
print('Original Age variable variance: ', X_train['Age'].var())
print('Age Variance after 99 wala imputation: ', X_train['Age_99'].var())
print('Age Variance after -1 wala imputation: ', X_train['Age_minus1'].var())
print('Original Fare variable variance: ', X_train['Fare'].var())
print('Fare Variance after 999 wala imputation: ', X_train['Fare_999'].var())
print('Fare Variance after -1 wala imputation: ', X_train['Fare_minus1'].var())


Original Age variable variance:  204.3495133904614
Age Variance after 99 wala imputation:  951.7275570187172
Age Variance after -1 wala imputation:  318.0896202624484
Original Fare variable variance:  2448.197913706318
Fare Variance after 999 wala imputation:  47219.20265217623
Fare Variance after -1 wala imputation:  2378.5676784883503


fig = plt.figure() 


ax = fig.add subplot(111) 


# original variable distribution
X_train['Age'].plot(kind='kde', ax=ax)

# variable imputed with the arbitrary value 99
X_train['Age_99'].plot(kind='kde', ax=ax, color='red')

# variable imputed with the arbitrary value -1
X_train['Age_minus1'].plot(kind='kde', ax=ax, color='green')

# add legends
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')


<matplotlib.legend.Legend at 0x28930f0acd0> 
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fig = plt.figure() 
.add subplot (111) 


ax = fig 


75 100 


# original variable distribution 


X train 


'Fare'].pl 


ot(kindz'kde', 


# variable imputed with the median 
"Fare 999'].plot (kind='kde', 


X train 


# variable imputed with the mean 


X train 


"Fare minusl'].plot(kind="kde", 


# add legends 


lines, 


labels = 
ax.legend(lines, 


labels, 


ax=ax) 


ax=ax, 


ax=ax, 


ax.get legend handles labels () 
loc='best') 


<matplotlib.legend.Legend at 0x28931765760> 


-500 


-250 


X train.cov() 


Age 
Fare 
Family 
Age 99 


Age minus1 


Age 
204.349513 
70.719262 
-6.498901 
204.349513 


204.349513 


0 


250 


Fare 


70.719262 


2448.197914 


17.258917 


-101.671097 


125.558364 


500 


750 


Family 
-6.498901 
17.258917 

2.735252 
-7.387287 


-4.149246 


—— Fare 
—— Fare 999 


150 


colorz'red') 


—— Fare minusl 


Age 99 
204.349513 
-101.671097 
-7.387287 
951.727557 


-189.535540 


1000 1250 1500 


Age_minus1 
204.349513 
125.558364 

-4,149246 
-189.535540 


318.089620 


color='green') 


Fare_999 
162.793430 
2448.197914 
11.528625 
-159.931663 


257.379887 


Fare_minus1 
63.321188 
2448.197914 
16.553989 
-94.317400 


114.394141 


Age Fare Family Age 99 Age minus) Fare 999 Fare minus1 
Fare 999 162.793430 2448.197914 11.528625 -159.931663 257.379887 47219.202652 762.474982 


Fare minus1 63.321188 2448.197914 16.553989 -94.317400 114.394141 762.474982 2378.567678 


X train.corr() 


Age Fare Family Age 99 Age minus1 Fare 999 Fare minus1 

Age 1.000000 0.092644 -0.299113 1.000000 1.000000 0.051179 0.084585 
Fare 0.092644 1.000000 0.208268 -0.066273 0.142022 1.000000 1.000000 
Family -0.299113 0.208268 1.000000 -0.144787 -0.140668 0.032079 0.205233 


Age 99 1.000000 -0.066273 -0.144787 1.000000 -0.344476 -0.023857 -0.062687 


Age_minus1 1.000000 0.142022 -0.140668 -0.344476 1.000000 0.066411 0.131514 
Fare_999 0.051179 1.000000 0.032079 -0.023857 0.066411 1.000000 0.071946 
Fare minus1 0.084585 1.000000 0.205233 -0.062687 0.131514 0.071946 1.000000 


Using Sklearn 


X train,X test,y train, y test = train test split(X,y,test size=0.2,random state=2) 


imputer1 = SimpleImputer(strategy='constant', fill_value=99)
imputer2 = SimpleImputer(strategy='constant', fill_value=999)
trf = ColumnTransformer([
    ('imputer1', imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare'])
], remainder='passthrough')

trf.fit(X train) 


ColumnTransformer (remainder='passthrough', 
transformers=[ ('imputerl', 
SimpleImputer (fill value=99, 
strategy-'constant'), 


['Age']), 

('imputer2', 

Simplelmputer(fill value=999, 
strategy-'constant'), 


['Fare'])]) 
trf.named transformers ['imputerl'].statistics 
array([99.]) 
trf.named transformers ['imputer2'].statistics 


array([999.]) 


X_train = trf.transform(X_train)
X_test = trf.transform(X_test)


X_train


array([


40 , 27.7208, 0 
4 , 16.7 i 2 
47 i 94 F 0 
71 , 49.5042, 0 
99 y 225117923 0 


Random Sample Imputation 


import numpy as np 
import pandas as pd 


from sklearn.model selection import train test split 


import matplotlib.pyplot as plt 
import seaborn as sns 


df = pd.read_csv('train.csv',usecols=['Age','Fare', 'Survived']) 


df.head() 
Survived Age Fare 
0 0 22.0 7.2500
1 1 38.0 71.2833
2 1 26.0 7.9250
3 1 35.0 53.1000 
4 0 35.0 8.0500 


df.isnull().mean() * 100 


Survived 0.00000 
Age 19.86532 
Fare 0.00000 


dtype: float64 


X = df.drop(columns=['Survived'])
y = df['Survived']


X train,X test,y train, y test = train test split(X,y,test size=0.2,random state=2) 


X train 


Age Fare 

30 40.0 27.7208 
10 4.0 16.7000 
873 47.0 9.0000 
182 9.0 31.3875 


876 20.0 9.8458 


534 30.0 8.6625 


Age Fare 
584 NaN 8.7125 
493 71.0 49.5042 
527 NaN 221.7792 


168 NaN 25.9250 

712 rows x 2 columns 

X train['Age imputed'] = X train['Age'] 
X test['Age imputed'] = X test['Age'] 


C:\Users\HP\AppData\Local\Temp/ipykernel 80384/1230362693.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
X train['Age imputed'] = X train['Age'] 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/1230362693.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
X test['Age imputed'] = X test['Age'] 


X test.tail() 


Age Fare Age imputed 


89 24.0 8.0500 24.0
80 22.0 9.0000 22.0
846 NaN 69.5500 NaN
870 26.0 7.8958 26.0
251 29.0 10.4625 29.0 
X_train['Age_imputed'][X_train['Age_imputed'].isnull()] = X_train['Age'].dropna().sample(X_train['Age'].isnull().sum()).values
X_test['Age_imputed'][X_test['Age_imputed'].isnull()] = X_train['Age'].dropna().sample(X_test['Age'].isnull().sum()).values


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:8870: SettingWithCopyWar 
ning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
return self. update inplace (result) 


X train['Age'].dropna().sample(X train['Age'].isnull().sum()).values 


árray([54. q 50. + 30. y 59. cp 40. , 22. yp Ae p 54. y 4132. > 
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X train['Age'].isnull().sum() 


148 


X train 


Age Fare Age imputed 


30 40.0 27.7208 
10 4.0 16.7000 
873 47.0 9.0000
182 9.0 31.3875


876 20.0 9.8458 


534 30.0 8.6625 
584 NaN 8.7125 
493 71.0 49.5042 
527 NaN 221.7792 


168 NaN 25.9250 


712 rows x 3 columns 
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4.0 
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sns.distplot (X_train['Age'],label='Original',hist=False) 
sns.distplot(X train['Age imputed'],label = 'Imputed',hist=False) 


plt.legend() 
plt.show() 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:
`distplot` is a deprecated function and will be removed in a future version. Please adapt
your code to use either `displot` (a figure-level function with similar flexibility) or
`kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:
`distplot` is a deprecated function and will be removed in a future version. Please adapt
your code to use either `displot` (a figure-level function with similar flexibility) or
`kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)


In [16] 
Out [16]: 
In [17]: 


Out 


[17]: 


[20]: 


[20]: 


0.030 —— Original 


—— Imputed 
0.025 
0.020 
E 
a 
5 0015 
a 
0.010 
0.005 
0.000 
0 20 40 60 80 
Age imputed 
print('Original variable variance: ', X train['Age'].var()) 
print('Variance after random imputation: ', X train['Age imputed'].var()) 


Original variable variance:  204.3495133904614 
Variance after random imputation:  208.142956065796 


X train[['Fare', 'Age', 'Age imputed']].cov() 


Fare Age Age imputed 
Fare 2368.246832 71.512440 58.923994 
Age 71.512440 204.349513 204.349513 


Age imputed 58.923994 204.349513 208.142956 


X train[['Age', 'Age imputed']].boxplot() 


<AxesSubplot:> 
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Age Age_imputed 


data = pd.read_csv('house-train.csv',usecols=['GarageQual','FireplaceQu','SalePrice'])


data.head()


FireplaceQu GarageQual SalePrice 


0 NaN TA 208500 
1 TA TA 181500 
2 TA TA 223500 
3 Gd TA 140000 
4 TA TA 250000 


data.isnull().mean() * 100 


FireplaceQu 47.260274 
GarageQual 5.547945 
SalePrice 0.000000 
dtype: float64 


X = data
y = data['SalePrice']


X train,X test,y train, y test = train test split(X,y,test size=0.2,random state=2) 


X train['GarageQual imputed'] = X train['GarageQual'] 

X test['GarageQual imputed'] = X test['GarageQual'] 

X train['FireplaceQu imputed'] = X train['FireplaceQu'] 
X test['FireplaceQu imputed'] = X test['FireplaceQu'] 


C:\Users\HP\AppData\Local\Temp/ipykernel 80384/3838090268.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['GarageQual imputed'] = X train['GarageQual'] 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/3838090268.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X test ['GarageQual imputed'] = X test ['GarageQual'] 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/3838090268.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['FireplaceQu imputed'] = X train['FireplaceQu'] 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/3838090268.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row indexer,col indexer] - value instead 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
X test ['FireplaceQu imputed'] = X test['FireplaceQu'] 


X train.sample(5) 


FireplaceQu GarageQual SalePrice GarageQual imputed FireplaceQu imputed 


745 TA TA 299800 TA TA 

300 Gd TA 157000 TA Gd 

695 TA TA 176000 TA TA 

1170 Po TA 171000 TA Po 

1110 TA TA 188000 TA TA 

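X_train['GarageQual_imputed'][X_train['GarageQual_imputed'].isnull()] = X_train['GarageQual'].dropna().sample(X_train['GarageQual'].isnull().sum()).values
X_test['GarageQual_imputed'][X_test['GarageQual_imputed'].isnull()] = X_train['GarageQual'].dropna().sample(X_test['GarageQual'].isnull().sum()).values
X_train['FireplaceQu_imputed'][X_train['FireplaceQu_imputed'].isnull()] = X_train['FireplaceQu'].dropna().sample(X_train['FireplaceQu'].isnull().sum()).values
X_test['FireplaceQu_imputed'][X_test['FireplaceQu_imputed'].isnull()] = X_train['FireplaceQu'].dropna().sample(X_test['FireplaceQu'].isnull().sum()).values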
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/856878696.py:1: SettingWithCopyWarning: 


A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X train['GarageQual imputed'][X train['GarageQual imputed'].isnull()] = X train['GarageQ 
ual'].dropna().sample(X train['GarageQual'].isnull().sum()).values 
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:8870: SettingWithCopyWar 
ning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
return self. update inplace(result) 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/856878696.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X test ['GarageQual imputed'][X test['GarageQual imputed'].isnull()] = X train['GarageQua 
l'].dropna().sample(X test['GarageQual'].isnull().sum()).values 
C:\Users\HP\AppData\Local\Temp/ipykernel 80384/856878696.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 

ide/indexing.html#returning-a-view-versus-a-copy 
X train['FireplaceQu imputed'][X train['FireplaceQu imputed'].isnull()] = X train['Firep 

laceQu'].dropna().sample(X train['FireplaceQu'].isnull().sum()).values 

C:\Users\HP\AppData\Local\Temp/ipykernel 80384/856878696.py:5: SettingWithCopyWarning: 

A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

X test ['FireplaceQu imputed'][X test['FireplaceQu imputed'].isnull()] = X train['Firepla 
ceQu'].dropna().sample(X test['FireplaceQu'].isnull().sum()).values 


temp = pd.concat(
    [
        X_train['GarageQual'].value_counts() / len(X_train['GarageQual'].dropna()),
        X_train['GarageQual_imputed'].value_counts() / len(X_train)
    ],
    axis=1)


temp.columns = ['original', 'imputed'] 


temp 


original imputed 
TA 0.951043 0.952055 
Fa 0.037171 0.035959 
Gd 0.009973 0.010274 
Po 0.000907 0.000856 
Ex 0.000907 0.000856 
temp = pd.concat(
    [
        X_train['FireplaceQu'].value_counts() / len(X_train['FireplaceQu'].dropna()),
        X_train['FireplaceQu_imputed'].value_counts() / len(df)
    ],
    axis=1)
temp.columns = ['original', 'imputed']
temp
original imputed
Gd 0.494272 0.647587
TA 0.412439 0.543210 
Fa 0.040917 0.052750 
Po 0.027823 0.035915 
Ex 0.024550 0.031425 
for category in X_train['FireplaceQu'].dropna().unique():
    sns.distplot(X_train[X_train['FireplaceQu'] == category]['SalePrice'],hist=False,label=category)
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 
warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 
warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 
warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 
warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 


our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 
warnings.warn(msg, FutureWarning) 
le-5 
SalePrice 1e6 
for category in X_train['FireplaceQu_imputed'].dropna().unique():
    sns.distplot(X_train[X_train['FireplaceQu_imputed'] == category]['SalePrice'],hist=False,label=category)
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 


distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 


distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 


distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 


distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘kd 
eplot' (an axes-level function for kernel density plots). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:
`distplot` is a deprecated function and will be removed in a future version. Please adapt
your code to use either `displot` (a figure-level function with similar flexibility) or
`kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)


00 02 04 06 08 10 
SalePrice le6 


In [ ]: 


KNN Imputer 


import numpy as np 
import pandas as pd 


from sklearn.model selection import train test split 


from sklearn.impute import KNNImputer,SimpleImputer 
from sklearn.linear model import LogisticRegression 


from sklearn.metrics import accuracy score 


df = pd.read csv('train.csv')[['Age','Pclass', "Fare", 'Survived']] 
df.head() 
Age Pclass Fare Survived 
0 22.0 3 7.2500 0 
1 38.0 1 71.2833 1
2 26.0 3 7.9250 1
3 35.0 1 53.1000 1
4 35.0 3 8.0500 0


df.isnull().mean() * 100 


Age 19.86532 
Pclass 0.00000 
Fare 0.00000 
Survived 0.00000 


dtype: float64 


X = df.drop(columns=['Survived'])
y = df['Survived']


X train,X test,y train, y test = train test split(X,y,test size=0.2,random state=2) 


X_train.head() 


Age Pclass Fare 

30 40.0 1 27.7208 
10 4.0 3 16.7000
873 47.0 3 9.0000 
182 9.0 3 31.3875 


876 20.0 3 9.8458 


knn = KNNImputer (n_neighbors=3,weights='distance') 


X train trf = knn.fit transform(X train) 
X test trf = knn.transform(X test) 


lr = LogisticRegression() 


lr.fit(X train trf,y train) 


y pred = lr.predict(X test trf) 
accuracy score(y test,y pred) 


0.7150837988826816 


# Comparision with Simple Imputer --> mean 
si = SimpleImputer () 


X train trf2 = si.fit transform(X train) 
X test trf2 = si.transform(X test) 


lr = LogisticRegression() 


lr.fit(X train trf2,y train) 
y pred2 = lr.predict(X test trf2) 
accuracy score(y test,y pred2) 


0.6927374301675978 
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from sklearn.linear model import LinearRegression 


df = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9) 

df = df.sample(5) 

df 


R&D Spend Administration Marketing Spend Profit 


21 8.0 15.0 30.0 11.0 
37 4.0 5.0 20.0 9.0 

2 15.0 10.0 41.0 19.0 
14 12.0 16.0 26.0 13.0 
44 2.0 15.0 3.0 7.0 


df = df.iloc[:,0:-1] 


R&D Spend Administration Marketing Spend 


21 8.0 15.0 30.0 
37 4.0 5.0 20.0
2 15.0 10.0 41.0 
14 12.0 16.0 26.0 
44 2.0 15.0 3.0 
df.iloc[1,0] = np.NaN 
df.iloc[3,1] = np.NaN 
df.iloc[-1,-1] = np.NaN 


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWa 
rning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 

self. setitem single block(indexer, value, name) 
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:723: SettingWithCopyWar 
ning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
iloc. setitem with indexer(indexer, value, self.name) 


df.head() 


R&D Spend Administration Marketing Spend 


21 8.0 15.0 30.0 
37 NaN 5.0 20.0 
2 15.0 10.0 41.0 


14 12.0 NaN 26.0 


R&D Spend Administration Marketing Spend 


44 2.0 15.0 NaN 


# Step 1 - Impute all missing values with mean of respective col 


df0 = pd.DataFrame()


df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())


# 0th Iteration
df0


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 
37 9.25 5.00 20.00 

2 15.00 10.00 41.00 
14 12.00 11.25 26.00 
44 2.00 15.00 29.25 


# Remove the col1 imputed value
df1 = df0.copy()


df1.iloc[1,0] = np.NaN


df1


R&D Spend Administration Marketing Spend 


21 8.0 15.00 30.00 
37 NaN 5.00 20.00 

2 15.0 10.00 41.00 
14 12.0 11.25 26.00 
44 2.0 15.00 29.25 


# Use first 3 rows to build a model and use the last for prediction 


X = df1.iloc[[0,2,3,4],1:3]


X 
Administration Marketing Spend 
21 15.00 30.00 
2 10.00 41.00 
14 11.25 26.00 


44 15.00 29.25 


y = dfl.iloc[[0,2,3,4],0] 
y 

21 8.0 

2 15.0 

14 12.0 

44 2.0 


Name: R&D Spend, dtype: float64 


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict (dfl.iloc[1,1:].values.reshape (1,2)) 


array([23.14158651]) 


dfl.iloc[1,0] = 23.14 


df1 


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 
37 23.14 5.00 20.00 

2 15.00 10.00 41.00 
14 12.00 11.25 26.00 
44 2.00 15.00 29.25 


# Remove the col2 imputed value
df1.iloc[3,1] = np.NaN


df1


R&D Spend Administration Marketing Spend 


21 8.00 15.0 30.00 
37 23.14 5.0 20.00 
2 15.00 10.0 41.00 
14 12.00 NaN 26.00 
44 2.00 15.0 29.25 
# Use last 3 rows to build a model and use the first for prediction 


X = dfl.iloc[[0,1,2,4],[0,2]] 


R&D Spend Marketing Spend 
21 8.00 30.00 


37 23.14 20.00 


R&D Spend Marketing Spend 
2 15.00 41.00 


44 2.00 29.25 


y = df1.iloc[[0,1,2,4],1]
y

21 15.0

37 5.0

2 10.0

44 15.0


Name: Administration, dtype: float64 


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict (dfl.iloc[3,[0,2]].values.reshape (1,2) ) 


array([11.06331285]) 


df1.iloc[3,1] = 11.06


df1


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 
37 23.14 5.00 20.00 

2 15.00 10.00 41.00 
14 12.00 11.06 26.00 
44 2.00 15.00 29.25 


# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN


df1


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.0 
37 23.14 5.00 20.0 

2 15.00 10.00 41.0 
14 12.00 11.06 26.0 
44 2.00 15.00 NaN 


# Use last 3 rows to build a model and use the first for prediction 
X = dfl.iloc[0:4,0:2] 
X 


R&D Spend Administration 


21 8.00 15.00 
37 23.14 5.00 
2 15.00 10.00 
14 12.00 11.06 
y = dfl.iloc[0:4,-1] 

y 

21 30.0 

37 20.0 

2 41.0 

14 26.0 


Name: Marketing Spend, dtype: float64 


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict(dfl.iloc[4,0:2].values.reshape(1,2)) 


array([31.56351448]) 


df1.iloc[4,-1] = 31.56


# After 1st Iteration
df1


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 
37 23.14 5.00 20.00 

2 15.00 10.00 41.00 
14 12.00 11.06 26.00 
44 2.00 15.00 31.56 


# Subtract 0th iteration from 1st iteration


df1 - df0


R&D Spend Administration Marketing Spend 


21 0.00 0.00 0.00 
37 13.89 0.00 0.00 

2 0.00 0.00 0.00 
14 0.00 -0.19 0.00 
44 0.00 0.00 2.31 


df2 = dfl.copy() 


df2.iloc[1,0] = np.NaN 


df2 


R&D Spend Administration Marketing Spend 


21 8.0 15.00 30.00 
37 NaN 5.00 20.00 
2 15.0 10.00 41.00 
14 12.0 11.06 26.00 
44 2.0 15.00 31.56 
X = df2.iloc[[0,2,3,4],1:3]


y = df2.iloc[[0,2,3,4],0]


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict (df2.iloc[1,1:].values.reshape (1,2) ) 


array ([23.78627207] ) 


df2.iloc[1,0] = 23.78 


df2.iloc[3,1] = np.NaN 
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1] 


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict (df2.iloc[3,[0,2]].values.reshape (1,2)) 


array([11.22020174]) 


df2.iloc[3,1] = 11.22


df2.iloc[4,-1] = np.NaN 


X = df2.iloc[0:4,0:2] 
y = df2.iloc[0:4,-1] 


lr = LinearRegression() 
lr.fit(X, y) 
lr.predict (df2.iloc[4,0:2].values.reshape(1,2)) 


array ([38.87979054]) 


df2.iloc[4,-1] = 31.56


df2 


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 

37 23.78 5.00 20.00 
2 15.00 10.00 41.00 

14 12.00 11.22 26.00 

44 2.00 15.00 31.56 
df2 - dfl 


R&D Spend Administration Marketing Spend 


21 0.00 0.00 0.0 
37 0.64 0.00 0.0 

2 0.00 0.00 0.0 
14 0.00 0.16 0.0 
44 0.00 0.00 0.0


df3 = df2.copy() 
df3.iloc[1,0] = np.NaN 


df3 


R&D Spend Administration Marketing Spend 


21 8.0 15.00 30.00 

37 NaN 5.00 20.00 
2 15.0 10.00 41.00 

14 12.0 11.22 26.00 

44 2.0 15.00 31.56 
X = df3.iloc[[0,2,3,4],1:3] 


y = df3.iloc[[0,2,3,4],0]


lr = LinearRegression() 
lr.fit(X,y) 
lr.predict (df3.iloc[1,1:].values.reshape (1,2)) 


array([24.57698058]) 


df3.iloc[1,0] = 24.57


df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1] 


lr = LinearRegression() 


lr.fit(X,y)
lr.predict (df3.iloc[3,[0,2]].values.reshape (1,2)) 


array ([11.37282844]) 


df3.iloc[3,1] = 11.37 


df3.iloc[4,-1] = np.NaN 


X = df3.iloc[0:4,0:2] 
y = df3.iloc[0:4,-1] 


lr = LinearRegression() 
lr.fit(X, y) 
lr.predict(df3.iloc[4,0:2].values.reshape(1,2)) 


array([45.53976417]) 


df3.iloc[4,-1] = 45.53 


df2.iloc[3,1] = 11.22 


df3 


R&D Spend Administration Marketing Spend 


21 8.00 15.00 30.00 

37 24.57 5.00 20.00 
2 15.00 10.00 41.00 

14 12.00 11.37 26.00

44 2.00 15.00 45.53 
df3 - df2 


R&D Spend Administration Marketing Spend 


21 0.00 0.00 0.00 
37 0.79 0.00 0.00 

2 0.00 0.00 0.00 
14 0.00 0.15 0.00 


44 0.00 0.00 13.97 
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Outliers Detection Method 


1. Z-Score
2. IQR
3. Winsorization or Percentile


The concept of outliers: 
What is it? 


Outliers are data points that are 
significantly different from the 
majority of the other data points 
in a dataset. In machine learning, 
they can have a significant 
impact on the results of a model 
if they are not detected and 
handled appropriately. Outliers 
can be due to measurement 
errors, errors in data collection, or 
they can be genuine examples 
that are not representative of the 
population. 


Z-Score 


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


df = pd.read csv('placement.csv') 


df.shape 
Out[3]: (1000, 3)
In [4]: df.sample(5)
Out[4]: cgpa placement_exam_marks placed
719 7.47 26.0 0
457 6.58 20.0 0
542 7.06 22.0 0
733 7.07 10.0 0
770 7.33 67.0 1
In [5] plt.figure(figsize=(16,5)) 
plt.subplot(1,2,1) 
sns.distplot (df['cgpa']) 
plt.subplot(1,2,2) 
sns.distplot(df['placement exam marks']) 
plt.show() 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) 
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cgpa placement exam marks 
In [6]: df['placement exam marks'].skew() 


Out[6]: 0.8356419499466834 


print("Mean value of cgpa",df['cgpa'].mean())
print("Std value of cgpa",df['cgpa'].std())
print("Min value of cgpa",df['cgpa'].min())
print("Max value of cgpa",df['cgpa'].max())


Mean value of cgpa 6.96124000000001
Std value of cgpa 0.6158978751323894
Min value of cgpa 4.89
Max value of cgpa 9.12


# Finding the boundary values
print("Highest allowed",df['cgpa'].mean() + 3*df['cgpa'].std())
print("Lowest allowed",df['cgpa'].mean() - 3*df['cgpa'].std())


Highest allowed 8.808933625397177
Lowest allowed 5.113546374602842


# Finding the outliers
df[(df['cgpa'] > 8.80) | (df['cgpa'] < 5.11)]


cgpa placement_exam_marks placed


485 4.92 44.0 1
995 8.87 44.0 1
996 9.12 65.0 1
997 4.89 34.0 0
999 4.90 10.0 1


Trimming


# Trimming
new_df = df[(df['cgpa'] < 8.80) & (df['cgpa'] > 5.11)]
new_df


cgpa placement_exam_marks placed


0 7.19 26.0 1
1 7.46 38.0 1
2 7.54 40.0 1
3 6.42 8.0 1
4 7.23 17.0 0
991 7.04 57.0 0
992 6.26 12.0 0
993 6.73 21.0 1
994 6.48 63.0 0
998 8.62 46.0 1


995 rows x 3 columns


# Approach 2 

# Calculating the Zscore 

df['cgpa zscore'] = (df['cgpa'] - df['cgpa'].mean())/df['cgpa'].std() 
df.head() 


cgpa placement exam marks placed cgpa zscore 


0 7.19 26.0 1 0.371425 
1 7.46 38.0 1 0.809810
2 7.54 40.0 1 0.939701
3 6.42 8.0 1 -0.878782
4 7.23 17.0 0 0.436371 


df[df['cgpa_zscore'] > 3]


cgpa placement exam marks placed cgpa zscore 
995 8.87 44.0 1 3.099150 


996 9.12 65.0 1 3.505062 


df[df['cgpa_zscore'] < -3]


cgpa placement_exam_marks placed cgpa_zscore


485 4.92 44.0 1 -3.314251
997 4.89 34.0 0 -3.362960
999 4.90 10.0 1 -3.346724
df[(df['cgpa_zscore'] > 3) | (df['cgpa_zscore'] < -3)]


cgpa placement_exam_marks placed cgpa_zscore


485 4.92 44.0 1 -3.314251
995 8.87 44.0 1 3.099150 
996 9.12 65.0 1 3.505062 
997 4.89 34.0 0 -3.362960 
999 4.90 10.0 1 -3.346724 


# Trimming 
new df = df[(df['cgpa zscore'] < 3) & (df['cgpa zscore'] > -3)] 


new df 


cgpa placement exam marks placed 


0 7.19 26.0 1 
1 7.46 38.0 1
2 7.54 40.0 1
3 6.42 8.0 1
4 7.23 17.0 0
991 7.04 57.0 0
992 6.26 12.0 0
993 6.73 21.0 1
994 6.48 63.0 0
998 8.62 46.0 1 


995 rows x 4 columns 


new df['cgpa'].describe() 


count 995.000000 
mean 6.963357 
std 0.600082 
min 5.230000 
25% 6.550000
50% 6.960000 
75% 7.365000 
max 8.620000 


Name: cgpa, dtype: float64 


Capping 


upper_limit = df['cgpa'].mean() + 3*df['cgpa'].std()
lower_limit = df['cgpa'].mean() - 3*df['cgpa'].std()


upper limit 


8.808933625397177 


lower limit 


5.113546374602842 


#Capping fun 
df['cgpa'] = np.where(
    df['cgpa']>upper_limit,
    upper_limit,
    np.where(
        df['cgpa']<lower_limit,
        lower_limit,


cgpa zscore 


0.371425 


0.809810 


0.939701 


-0.878782 


0.436371 


0.127878 
-1.138565 
-0.375452 
-0.781363 


2.693239 


        df['cgpa']
    )
)


df.shape 


(1000, 4) 


df['cgpa'].describe() 


count 1000.000000 


mean 6.961499 
std 0.612688 
min 5.113546 
25% 6.550000
50% 6.960000 
75% 7.370000 
max 8.808934 


Name: cgpa, dtype: float64 


In [1]: 


In [2]: 


In [3]: 


out [3]: 


In [4]: 


IQR (Inter-quartile range) 


Interquartile Range 
(IQR) 
1.5 * IQR 1.5 * IQR 


Outliers Outliers 


Q1 Median Q3 
(25th percentile) (75th percentile) 


import numpy as np 

import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 


df = pd.read csv('placement.csv') 


df.head() 


cgpa placement exam marks placed 


0 7.19 26.0 1
1 7.46 38.0 1
2 7.54 40.0 1
3 6.42 8.0 1
4 7.23 17.0 0


plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot (df['cgpa']) 


plt.subplot(1,2,2) 
sns.distplot(df['placement exam marks']) 


plt.show() 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 


In [6]: 


Out[6]: 


In [7]: 


cgpa placement exam marks 


df['placement exam marks'].describe() 


count 1000.000000 


mean 32.225000 
std 19.130822 
min 0.000000 
25% 17.000000 
50% 28.000000
75% 44.000000 
max 100.000000 


Name: placement exam marks, dtype: float64 
sns.boxplot(df['placement exam marks']) 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be ‘data’, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 

warnings.warn( 
<AxesSubplot:xlabel='placement_exam_marks'>


LARA LAS 


0 20 40 60 80 100 
placement_exam_marks 


# Finding the IQR
percentile25 = df['placement_exam_marks'].quantile(0.25)
percentile75 = df['placement_exam_marks'].quantile(0.75)


percentile75


iqr = percentile75 - percentile25


iqr


upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr


print("Upper limit",upper_limit)
print("Lower limit",lower_limit)


Upper limit 84.5
Lower limit -23.5


Finding Outliers 


df[df['placement_exam_marks'] > upper_limit]


134 


162 


283 


290 


311 


324 


630 


685 


730 


771 


846 


917 


cgpa placement exam marks placed 


1.15 


6.60 


7.51 


6.33 


7.80 


7.09 


8.38 


6.97 


6.64 


6.56 


6.05 


6.14 


7.31 


6.99 


5.95 


94.0 


86.0 


86.0 


93.0 


90.0 


87.0 


87.0 


87.0 


90.0 


96.0 


87.0 


90.0 


86.0 


97.0 


100.0 


1 


1 


df[df['placement_exam_marks'] < lower_limit]


cgpa placement exam marks placed 


Trimming 


new df = df[df['placement exam marks'] < upper limit] 


new df.shape 


(985, 3) 


# Comparing 


plt.figure(figsize=(16,8)) 
plt.subplot(2,2,1) 
sns.distplot(df['placement exam marks']) 


plt.subplot(2,2,2) 
sns.boxplot(df['placement exam marks']) 


plt.subplot(2,2,3) 
sns.distplot (new df['placement exam marks']) 


plt.subplot(2,2,4) 
sns.boxplot (new df['placement exam marks']) 


plt.show() 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either 'displot” (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be “data”, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 


warnings.warn( 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 


warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be “data”, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 


warnings.warn( 


“eua. 


0 20 40 60 80 100 
placement_exam_marks placement_exam_marks 


0 20 40 60 80 
placement_exam_marks placement_exam_marks 


Capping 


In [20]: new_df_cap = df.copy()
new_df_cap['placement_exam_marks'] = np.where(
    new_df_cap['placement_exam_marks'] > upper_limit,
    upper_limit,
    np.where(
        new_df_cap['placement_exam_marks'] < lower_limit,
        lower_limit,
        new_df_cap['placement_exam_marks']
    )
)
In [22]: new_df_cap.shape
Out[22]: (1000, 3)
In [23]: # Comparing
plt.figure(figsize=(16,8))
plt.subplot(2,2,1)
sns.distplot(df['placement_exam_marks'])

plt.subplot(2,2,2)
sns.boxplot(df['placement_exam_marks'])

plt.subplot(2,2,3)
sns.distplot(new_df_cap['placement_exam_marks'])

plt.subplot(2,2,4)
sns.boxplot(new_df_cap['placement_exam_marks'])

plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 


In [ ]: 


our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be “data”, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 

warnings.warn( 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be “data”, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 

warnings.warn( 
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placement_exam_marks placement_exam_marks 
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mm — [Winsorization] - o 
l 


e 
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import numpy as np 
import pandas as pd 


# Load the height/weight data set used for the percentile-capping example.
df = pd.read_csv('weight-height.csv')


df.head() 


Gender Height Weight 
0 Male 73.847017 241.893563 
1 Male 68.781904 162.310473 
2 Male 74.110105 212.740856 
3 Male 71.730978 220.042470 


4 Male 69.881796 206.349801 


df.shape 


(10000, 3) 


df['Height'].describe() 


count 10000.000000 


mean 66.367560 
std 3.847528 
min 54.263133 
25% 63.505620 
50% 66.318070 
75% 69.174262 
max 78.998742 


Name: Height, dtype: float64 


import seaborn as sns 


sns.distplot (df['Height']) 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 


our code to use either '"displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) 
out [7]: <AxesSubplot:xlabel='Height', ylabel='Density'> 


0.08 
0.06 
Fa 
n 
c 
v 
A 004 
0.02 
0.00 
Height 
In [8]: | sns.boxplot (df ["Height"]) 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be “data”, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 
warnings.warn( 
Out[8]: <AxesSubplot:xlabel='Height'>
+» 
55 60 65 70 75 80 
Height 
In [9]: upper_limit = df['Height'].quantile(0.99)
upper_limit
Out[9]: 74.7857900583366
In [10]: lower_limit = df['Height'].quantile(0.01)
lower_limit
Out[10]: 58.13441158671655
In [11]: 


# Trimming: keep only rows whose Height lies inside the hard-coded
# [58.13, 74.78] band (rounded versions of the 1st/99th percentiles above).
new_df = df[(df['Height'] <= 74.78) & (df['Height'] >= 58.13)]


new_df['Height'].describe()


In [12]: 
count 9799.000000 
Out[12]: 
mean 66.363507
std 3.644267
min 58.134496
25% 63.577147
50% 66.317899
75% 69.119859
max 74.767447 
Name: Height, dtype: float64 
In [13]: df['Height'].describe() 
count 10000.000000 
Out[13]: 
mean 66.367560
std 3.847528
min 54.263133
25% 63.505620
50% 66.318070
75% 69.174262
max 78.998742 
Name: Height, dtype: float64 
In [14]: sns.distplot(new_df['Height'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot' is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) 
Out[14]: <AxesSubplot:xlabel='Height', ylabel='Density'>
575 600 625 650 675 700 725 750 77.5 
Height 
In [15]: 


# Box plot of Height after trimming.
sns.boxplot(new_df['Height'])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 
rgument will be ‘data’, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 

warnings.warn( 
<AxesSubplot:xlabel='Height'> 


Out[15]: 


57.5 60.0 62.5 65.0 67.5 70.0 72.5 75.0 


Height 
In [46]: # Capping --> Winsorization 
# Capping (winsorization): values at or above upper_limit are replaced by
# upper_limit, values at or below lower_limit by lower_limit; everything
# in between is left untouched, so no rows are dropped.
df['Height'] = np.where(df['Height'] >= upper_limit,
                        upper_limit,
                        np.where(df['Height'] <= lower_limit,
                                 lower_limit,
                                 df['Height']))
In [17]: df.shape 
Out[17]: (10000, 3)
In [18]: df['Height'].describe() 
count 10000.000000 
Out[18]: 
mean 66.366281
std 3.795717
min 58.134412
25% 63.505620
50% 66.318070 
75% 69.174262 
max 74.785790 
Name: Height, dtype: float64 
In [19]: 


sns.distplot (df['Height']) 


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: 
distplot is a deprecated function and will be removed in a future version. Please adapt y 
our code to use either ‘displot' (a figure-level function with similar flexibility) or ‘hi 
stplot' (an axes-level function for histograms). 

warnings.warn(msg, FutureWarning) 
<AxesSubplot:xlabel='Height', ylabel='Density'> 


Out [19]: 


0.08 


0.00 
57.5 600 625 650 675 700 725 750 775 
Height 


In [20]: sns.boxplot(df['Height'])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\ decorators.py:36: FutureWarning: Pass 
the following variable as a keyword arg: x. From version 0.12, the only valid positional a 


rgument will be ‘data’, and passing other arguments without an explicit keyword will resul 
t in an error or misinterpretation. 
warnings.warn( 


Out[20]: <AxesSubplot:xlabel='Height'>


57.5 60.0 62.5 65.0 67.5 70.0 72.5 75.0 
Height 
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A 


/ Ne m m m m m m m m L 
1 1 Native attributes! | 


Length 


UM / 


1 
Choice of 
rule 


import numpy as np 
import pandas as pd 


= + of the root table > = 


Y 


D la e 
^ , Construction 
4 rules 


9 
L 


Feature Construction 


! Secondary 


/ tables 
A 


Time 


from sklearn.model selection import cross val score 
from sklearn. linear model import LogisticRegression 


import seaborn as sns 


df = pd.read_csv('train.csv')[['Age', 'Pclass', 'SibSp', 'Parch',


df.head() 


Age Pclass SibSp Parch Survived 


0 220 3 1 0 
1 380 1 1 0 
2 260 3 0 0 
3 350 1 1 0 
4 350 3 0 0 


df.dropna (inplace=True) 


df.head() 


0 


1 


Age Pclass SibSp Parch Survived 


0 220 3 1 0 


1 


38.0 1 1 0 


0 


1 


Derivative 
value 


Double 


derivative val 


'Survived']] 


Age Pclass SibSp Parch Survived 


2 260 3 0 0 1 
3 350 1 1 0 1 
4 350 3 0 0 0 


# Target vector: the last column of df (Survived in the table shown above).
y = df.iloc[:, -1]


X.head() 


Age Pclass SibSp Parch 


0 220 3 1 0 
1 380 1 1 0 
2 260 3 0 0 
3 350 1 1 0 
4 350 3 0 0 


# Baseline: mean 20-fold cross-validated accuracy on the current columns of X.
np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20))


0.6933333333333332 
Applying Feature Construction 
X['Family size'] = X['SibSp'] + X['Parch'] + 1 


X.head() 


Age Pclass SibSp Parch Family size 


0 220 3 1 0 2 
1 380 1 1 0 2 
2 260 3 0 0 1 
3 350 1 1 0 2 
4 350 3 0 0 1 


def myfunc(num):
    """Map a family size to an ordinal family-type code.

    Returns 0 when ``num`` is exactly 1 (travelling alone), 1 when
    ``1 < num <= 4`` (small family) and 2 for every other value
    (large family).
    """
    if num == 1:
        # alone
        return 0
    elif num > 1 and num <= 4:
        # small family
        return 1
    else:
        # large family
        return 2


myfunc(4) 


X['Family type'] = X['Family size'].apply (myfunc) 


X.head() 


Age Pclass SibSp Parch Family size Family type 


0 220 3 1 0 2 1 

1 380 1 1 0 2 1 

2 260 3 0 0 1 0 

3 350 1 1 0 2 1 

4 350 3 0 0 1 0 
# Keep only the derived 'Family type' feature; drop the columns it was built from.
X.drop(columns=['SibSp', 'Parch', 'Family size'], inplace=True)
X.head() 


Age Pclass Family type 


0 220 3 1 
1 380 1 1 
2 260 3 0 
3 350 1 1 
4 350 3 0 


# Mean 20-fold cross-validated accuracy after feature construction.
np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20))


0.7003174603174602 


Feature e 
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A 


Feature Splitting 


from sklearn.model selection import cross val score 
from sklearn.linear model import LogisticRegression 


import seaborn as sns 


# Load the full training table (all columns) for the feature-splitting example.
df = pd.read_csv('train.csv')


df .head() 
Passengerld Survived  Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 
Braund, Mr. A/5 
0 1 0 3 Owen Harris male 22.0 1 0 21171 7.2500 NaN S 
Cumings, 
Mrs. John 
1 2 1 1 Bradley female 38.0 1 0 PC17599 71.2833 C85 Ç 
(Florence 
Briggs Th... 
Heikkinen, STON/O2. 
2 3 1 3 Miss. Laina female 26.0 0 0 3101282 7.9250 NaN S 
Futrelle, 
Mrs. 
3 4 1 1 Jacques female 35.0 1 0 113803 53.1000 C123 S 
Heath (Lily 
May Peel) 
Allen, Mr. 
4 5 0 3 William male 35.0 0 0 373450 8.0500 NaN 5 
Henry 
df['Name'] 
0 Braund, Mr. Owen Harris 
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 
2 Heikkinen, Miss. Laina 
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 
4 Allen, Mr. William Henry 
886 Montvila, Rev. Juozas 
887 Graham, Miss. Margaret Edith 
888 Johnston, Miss. Catherine Helen "Carrie" 
889 Behr, Mr. Karl Howell 
890 Dooley, Mr. Patrick 
Name: Name, Length: 891, dtype: object 
df['Name'].str.split(', ', expand=True) [1].str.split('.', expand=True) [0] 
0 Mr 
1 Mrs 
2 Miss 
3 Mrs 
4 Mr 
886 Rev 
887 Miss 
888 Miss 


890 Mr 
Name: 0, Length: 891, dtype: object 


df['Title'] = df['Name'].str.split(', ', expand=True) [1] .str.split('.', expand=True) [0] 


df['Name'].str.split(', ', expand=True) [1] .str.split('.', expand=True) [0] 


0 Mr 
1 Mrs 
2 Miss 
3 Mrs 
4 Mr 
886 Rev 
887 Miss 
888 Miss 
889 Mr 
890 Mr 


Name: 0, Length: 891, dtype: object 


df[['Title','Name']] 


Title Name 
0 Mr Braund, Mr. Owen Harris 


1 Mrs Cumings, Mrs. John Bradley (Florence Briggs Th... 


2 Miss Heikkinen, Miss. Laina 
3 Mrs Futrelle, Mrs. Jacques Heath (Lily May Peel) 
4 Mr Allen, Mr. William Henry 
886 Rev Montvila, Rev. Juozas 
887 Miss Graham, Miss. Margaret Edith 
888 Miss Johnston, Miss. Catherine Helen "Carrie" 
889 Mr Behr, Mr. Karl Howell 
890 Mr Dooley, Mr. Patrick 


891 rows x 2 columns 


# Survival rate per title, highest first. Select 'Survived' before
# aggregating so the mean is never attempted on the text columns
# (Name, Sex, Ticket, ...), which raises on pandas >= 2.0.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)


Title 
the Countess 1.000000
Mlle 1.000000
Sir 1.000000
Ms 1.000000
Lady 1.000000
Mme 1.000000
Mrs 0.792000
Miss 0.697802
Master 0.575000
Col 0.500000
Major 0.500000
Dr 0.428571 


Mr 0.156673 
Jonkheer 0.000000 
Rev 0.000000 
Don 0.000000 
Capt 0.000000 


Name: Survived, dtype: float64 


# Flag passengers whose title is 'Mrs'. A single .loc call with both the row
# mask and the column label writes into df itself; the chained form
# df['Is Married'].loc[mask] = 1 assigns through an intermediate Series and
# triggers SettingWithCopyWarning.
df['Is Married'] = 0
df.loc[df['Title'] == 'Mrs', 'Is Married'] = 1


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWa 
rning: 
A value is trying to be set on a copy of a slice from a DataFrame 


See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user gu 
ide/indexing.html#returning-a-view-versus-a-copy 
self. setitem single block(indexer, value, name) 


df['Is Married'] 


0 0 
1 1
2 0
3 1
4 0 
886 0 
887 0 
888 0 
889 0 
890 0 
Name: Is Married, Length: 891, dtype: int64 
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A 


Binarization 


import matplotlib.pyplot as plt 


from sklearn.model selection import train test split 
from sklearn.tree import DecisionTreeClassifier 


from sklearn.metrics import accuracy score 
from sklearn.model selection import cross val score 


A 


from sklearn.preprocessing import KBinsDiscretizer 
from sklearn.compose import ColumnTransformer 


A 


df = pd.read_csv('train.csv',usecols=['Age','Fare', 'Survived']) 


df .dropna(inplace=True) 


df.shape 
(714, 3) 
df.head() 

Survived Age Fare 
0 0 220 7.2500 
1 1 38.0 71.2833 
2 1 260 7.9250 
3 1 35.0 53.1000 
4 O 35.0 8.0500 


< 
[ 
fo) 
Fh 
tae 
O 
a 
[ee] 


# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X train.head(2) 


Age Fare 
328 31.0 20.5250 


73 26.0 14.4542 


# Baseline decision tree on the raw (un-binned) Age and Fare columns.
clf = DecisionTreeClassifier()


clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


accuracy_score(y_test, y_pred)


0.6433566433566433 


# Mean 10-fold cross-validated accuracy of the baseline tree on raw X.
np.mean(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy'))


0.6289319248826291 


# One quantile-based discretizer per column, 15 ordinal-encoded bins each.
kbin_age = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='quantile')


# Column 0 goes through kbin_age, column 1 through kbin_fare.
trf = ColumnTransformer([
    ('first', kbin_age, [0]),
    ('second', kbin_fare, [1])
])


# Fit on the training split only, then apply the same bins to the test split.
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)


# Bin edges learned by the discretizer registered under 'first'.
trf.named_transformers_['first'].bin_edges_


array([array([ 0.42,  6.  , 16.  , 19.  , 21.  , 23.  , 25.  , 28.  , 30.  ,
       32.  , 35.  , 38.  , 42.  , 47.  , 54.  , 80.  ])],
      dtype=object)


# Same lookup again: bin edges of the 'first' discretizer.
trf.named_transformers_['first'].bin_edges_


array([array([ 0.42,  6.  , 16.  , 19.  , 21.  , 23.  , 25.  , 28.  , 30.  ,
       32.  , 35.  , 38.  , 42.  , 47.  , 54.  , 80.  ])],
      dtype=object)


# Side-by-side view of the raw training values and their ordinal bin codes
# (column 0 of the transformed array is Age, column 1 is Fare).
output = pd.DataFrame({
    'age': X_train['Age'],
    'age trf': X_train_trf[:, 0],
    'fare': X_train['Fare'],
    'fare trf': X_train_trf[:, 1]
})


# Attach the interval each raw value falls into, using the edges the fitted
# discretizers learned. Each discretizer saw one column, hence bin_edges_[0].
# NOTE(review): the tail of both calls is cut off in the source; `.tolist())`
# is the assumed completion -- confirm against the original notebook.
output['age labels'] = pd.cut(x=X_train['Age'],
                              bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare labels'] = pd.cut(x=X_train['Fare'],
                               bins=trf.named_transformers_['second'].bin_edges_[0].tolist())


output.sample (5) 


age age trf fare fare trf age labels fare labels 
821 27.0 6.0 8.6625 4.0 (25.0, 28.0] (8.158, 10.5] 
230 35.0 10.0 83.4750 13.0 (32.0,35.0] (76.292, 108.9] 
784 250 6.0 7.0500 0.0 (23.0, 25.0] (0.0, 7.25] 


660 50.0 13.0 133.6500 14.0 (47.0, 54.0] (108.9, 512.329] 


age age trf fare fare trf age labels fare labels 


621 420 120 52.5542 12.0 (38.0,42.0] (51.479, 76.292] 


# Same decision tree, now trained and evaluated on the binned features.
clf = DecisionTreeClassifier()
clf.fit(X_train_trf, y_train)
y_pred2 = clf.predict(X_test_trf)


accuracy_score(y_test, y_pred2)

0.6363636363636364 

# Re-fit the binning transformer on the whole data set.
X_trf = trf.fit_transform(X)

# NOTE(review): the score below is computed on the raw X, not on X_trf, so it
# does not measure the effect of binning. Left unchanged so the recorded
# output still corresponds to this cell; pass X_trf to evaluate the binned data.
np.mean(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy'))
0.6303012519561815 

def discretize(bins, strategy):
    """Bin Age and Fare with KBinsDiscretizer and plot before/after histograms.

    Reads the module-level ``X`` and ``y``. Prints a mean 10-fold
    cross-validated DecisionTreeClassifier accuracy, then shows two figures
    comparing the raw column with its binned version (Age, then Fare).

    Parameters
    ----------
    bins : passed to ``KBinsDiscretizer(n_bins=...)`` for both columns.
    strategy : passed to ``KBinsDiscretizer(strategy=...)`` for both columns.
    """
    kbin_age = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)
    kbin_fare = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)

    # Column 0 goes through kbin_age, column 1 through kbin_fare.
    trf = ColumnTransformer([
        ('first', kbin_age, [0]),
        ('second', kbin_fare, [1])
    ])

    X_trf = trf.fit_transform(X)
    # NOTE(review): scored on the raw X, not X_trf, so the printed accuracy
    # does not reflect the chosen bins/strategy. Kept as-is to match the
    # recorded output; pass X_trf here to evaluate the discretized features.
    print(np.mean(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy')))

    # Age: raw histogram vs. histogram of ordinal bin codes.
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(X['Age'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(X_trf[:,0], color='red')
    plt.title("After")

    plt.show()

    # Fare: raw histogram vs. histogram of ordinal bin codes.
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(X['Fare'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(X_trf[:,1], color='red')
    plt.title("Fare")

    plt.show()


discretize(5, 'kmeans') 


0.6288928012519561 
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import numpy as np 
import pandas as pd 
# Build a small two-class, three-feature toy data set for the PCA walk-through.
np.random.seed(23)

# Class 1: 20 samples from a Gaussian with zero mean and identity covariance.
mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample, columns=['feature1','feature2','feature3'])
df['target'] = 1

# Class 0: 20 samples from a Gaussian with mean (1, 1, 1) and identity covariance.
mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample, columns=['feature1','feature2','feature3'])
df1['target'] = 0

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, df1], ignore_index=True)

# Sampling all 40 rows without replacement just shuffles them.
df = df.sample(40)


df.head() 


feature!  feature2 feature3 target 


2 -0.367548 -1.137460 -1.322148 1 


feature! feature2  feature3 target 


34 0.177061 -0.598109 1.226512 0 
14 0.420623 0.411620 -0.071324 1 
11 1.968435 -0.547788 -0.679418 1 
12 -2.506230 0.146960 0.606195 1 


import plotly.express as px
#y_train_trf = y_train.astype(str)

# Interactive 3D scatter of the three features, coloured by class label
# (cast to str so the colour is treated as a category, not a number).
fig = px.scatter_3d(df, x=df['feature1'], y=df['feature2'], z=df['feature3'],
                    color=df['target'].astype('str'))
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))


fig.show()
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# Step 1 - Apply standard scaling 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardize the three feature columns in place (the target column is untouched).
df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])


# Step 2 - Find Covariance Matrix
# np.cov treats each list entry as one variable, giving a 3x3 matrix.
covariance_matrix = np.cov([df.iloc[:,0],df.iloc[:,1],df.iloc[:,2]])
print('Covariance Matrix:\n', covariance_matrix)


Covariance Matrix: 
[[1.02564103 0.20478114 0.080118 ] 
[0.20478114 1.02564103 0.19838882] 
[0.080118 0.19838882 1.02564103]] 


# Step 3 - Finding eigenvalues and eigenvectors of the covariance matrix
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)


eigen values 


array([1.3536065 , 0.94557084, 0.77774573]) 


eigen vectors 


array([[-0.53875915, -0.69363291, 0.47813384], 
[-0.65608325, -0.01057596, -0.75461442], 
[-0.52848211, 0.72025103, 0.44938304]]) 


%pylab inline


from matplotlib import pyplot as plt 

from mpl toolkits.mplot3d import Axes3D 

from mpl toolkits.mplot3d import proj3d 

from matplotlib.patches import FancyArrowPatch 


class Arrow3D(FancyArrowPatch):
    """FancyArrowPatch whose end points are given in 3D data coordinates.

    The 3D end points are stored at construction time and projected to 2D
    every time the artist is drawn.
    """

    def __init__(self, xs, ys, zs, *args, **kwargs):
        """Store the 3D end points; extra arguments go to FancyArrowPatch.

        ``xs``, ``ys`` and ``zs`` each hold two values: start and end coordinate.
        """
        # The 2D positions are placeholders; draw() overwrites them.
        super().__init__((0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        """Project the stored 3D end points to 2D and draw the arrow."""
        xs3d, ys3d, zs3d = self._verts3d
        # Use the axes' projection matrix: renderer.M is deprecated since
        # Matplotlib 3.4, and the deprecation message points to self.axes.M.
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, self.axes.M)
        self.set_positions((xs[0],ys[0]), (xs[1],ys[1]))
        super().draw(renderer)


# 7x7 inch figure with a single 3D axes for the eigenvector plot.
fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111, projection='3d')


# NOTE(review): in the source these three calls are cut off at the right
# margin and their lines are out of order. Only the visible arguments are
# restored, plus the minimum needed to close each call (the 'o' marker on
# the mean point and the v[1] / feature3 / v[2] arrow coordinates). Any
# further styling keywords of the original are unknown -- confirm against
# the original notebook.

# Data points.
ax.plot(df['feature1'], df['feature2'], df['feature3'], 'o', markersize=8,
        color='blue')
# Centroid of the data.
ax.plot([df['feature1'].mean()], [df['feature2'].mean()], [df['feature3'].mean()],
        'o')

# One arrow per eigenvector (columns of eigen_vectors), starting at the centroid.
for v in eigen_vectors.T:
    a = Arrow3D([df['feature1'].mean(), v[0]], [df['feature2'].mean(), v[1]],
                [df['feature3'].mean(), v[2]])
    ax.add_artist(a)


# Axis labels and title, then render the figure.
ax.set_xlabel('x values')
ax.set_ylabel('y values')
ax.set_zlabel('z values')
plt.title('Eigenvectors')
plt.show()


Populating the interactive namespace from numpy and matplotlib 


[df [' feature 


C:\Users\HP\AppData\Local\Temp/ipykernel 88008/3713440988.py:16: MatplotlibDeprecationWarn 


ing: 


The M attribute was deprecated in Matplotlib 3.4 and will be removed two minor releases la 
ter. Use self.axes.M instead. 


Eigenvectors 


Z values 


# NOTE(review): np.linalg.eig returns eigenvectors as the COLUMNS of
# eigen_vectors (the plotting loop iterates eigen_vectors.T for that reason),
# so this slice picks the first two ROWS, not the first two principal axes.
# Left unchanged so the recorded outputs below still correspond to this
# cell; eigen_vectors.T[0:2] would select the two leading eigenvectors.
pc = eigen_vectors[0:2]


array([[-0.53875915, -0.69363291, 0.47813384], 
[-0.65608325, -0.01057596, -0.75461442]]) 


# Project the three feature columns onto the two selected components.
transformed_df = np.dot(df.iloc[:,0:3], pc.T)

# (40, 3) . (3, 2) -> (40, 2)

new_df = pd.DataFrame(transformed_df, columns=['PC1','PC2'])
# .values bypasses index alignment: df was shuffled, new_df has a fresh index.
new_df['target'] = df['target'].values

new_df.head()


PC1 PC2 target 
0 0.599433 1.795862 1 
1 23056919 -0.212737 0 
2 -0.271876 0.498222 1 
3 -0.621586 0.023110 1 
4 1.567286 1.730967 1 
# Cast the label to str so plotly colours it as a category, not a number.
new_df['target'] = new_df['target'].astype('str')


# 2D scatter of the projected data, coloured by class.
fig = px.scatter(x=new_df['PC1'],
                 y=new_df['PC2'],
                 color=new_df['target'],
                 color_discrete_sequence=px.colors.qualitative.G10
                )


fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))


fig.show()
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In-depth practical examples:
1. https://www.kaggle.com/code/kanav0183/pca-analysis-for-geneclassification 
2. https://www.kaggle.com/code/sid321axn/principal-component-analysis-pca 


3. https://www.kaggle.com/code/faressayah/support-vector-machine-pca-tutorial-for- 
beginner#4.-Principal-Component-Analysis (Special) 


4. https://towardsdatascience.com/principal-component-analysis-pca-explained-visually- 
with-zero-math-1cbf392b9e7d (Medium) 


